punkshell/src/bootsupport/modules/punk/char-0.1.0.tm

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2023
#
# @@ Meta Begin
# Application punk::char 0.1.0
# Meta platform     tcl
# Meta license      <unspecified>
# @@ Meta End

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::char 0 0.1.0]
#[copyright "2024"]
#[titledesc {character-set and unicode utilities}]  [comment {-- Name section and table of contents description --}]
#[moddesc {character-set nad unicode}]             [comment {-- Description at end of page heading --}]
#[require punk::char]
#[keywords module encodings]
#[description]

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[section Overview]
#[para] overview of punk::char
#[subsection Concepts]
#[para] -


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Requirements
##e.g package require frobz

#*** !doctools
#[subsection dependencies]
#[para] packages used by punk::char
#[list_begin itemized]
#[item] [package {Tcl 8.6}]

#

#*** !doctools
#[item] [package {overtype}]
#[para] -
#[item] [package {textblock}]
#[para] -
#[item] [package console]
#[para] -

package require Tcl 8.6-
#dependency on tcllib not ideal for bootstrapping as punk::char is core to many features.. (textutil::wcswidth is therefore included in bootsupport/include_modules.config) review
package require textutil
package require textutil::wcswidth

#*** !doctools
#[list_end]

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++

#*** !doctools
#[section API]

#Note that ansi escapes can begin with \033\[ (\u001b\[)  or the single character "Control Sequence Introducer" 0x9b

# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
tcl::namespace::eval punk::char {
    tcl::namespace::export *

    variable grapheme_widths [tcl::dict::create]
    # -- --------------------------------------------------------------------------
    variable encmimens  ;#namespace of mime package providing reversemapencoding and mapencoding functions
    #tcllib mime requires tcl::chan::memchan,events,core and/or Trf
    if {![catch {package require punk::encmime} errM]} {
        set encmimens ::punk::encmime
    } else {
        package require mime
        set encmimens ::mime
    }
    # -- --------------------------------------------------------------------------

    variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously
    variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common?


    #more useful for referring to ANSI documentation would be a proper 7-bit and 8-bit 'Code Table' layout
    #as described in ECMA-35 5.2
    # where the positions of the table are in one-to-one correspondence with the bit combinations of the code.
    #-  for 7-bit: 8 columns 16 rows
    #-  for 8-bit 16 columns 16 rows
    proc codetable {which} {
        set bits 8
        switch -- $which {
            ascii8 {
                set which default
            }
            ascii {
                set bits 7
            }
            default {
                if {$which ni [encoding names]} {
                    error "codetable unsupported - use 'ascii' or an entry from the result of the 'encoding names' command."
                }
            }
        }
        package require punk::ansi

        set hibit_count [expr {$bits-4}]
        set bitcolumns [expr {2**$hibit_count}] ;#always 4 bits for the rows - remaining bits for the columns
        set columncount [expr {$bitcolumns + 6}]


        #set header1 [list "" "b7\nb6\nb5" "0\n0\n0" "0\n0\n1" "0\n1\n0" "0\n1\n1" "1\n0\n0" "1\n0\n1" "1\n1\n0" "1\n1\n1"]
        set header1 [list]
        set hibits_label ""
        set indent ""
        for {set hb $bits} {$hb > 4} {incr hb -1} {
            append hibits_label ${indent}b$hb\n
            append indent "  "
        }
        set hibits_label [string range $hibits_label 0 end-1]
        lappend header1 $hibits_label "" "" "" "" ""

        for {set colidx 0} {$colidx < $bitcolumns} {incr colidx} {
            set binval [format %0${hibit_count}b $colidx]
            set binvalbits [split $binval ""]
            set indent  ""
            set display_hibits ""
            foreach bb $binvalbits {
                append display_hibits $indent$bb\n
                append indent " "
            }
            set display_hibits [string range $display_hibits 0 end-1]
            lappend header1 $display_hibits
        }
        #\u2193 down arrow
        #right-down arrows
        #\u2ba7
        #\u21b4
        #\u2b0e
        set header2 [list "   Bits" b4 b3 b2 b1 "column \u2192\nrow \u2b0e" {*}[punk::lib::range 0 $bitcolumns-1]]
        set headers [list $header1 $header2]

        #set t [textblock::table -return tableobject -rows $rows]
        set t [textblock::table -return tableobject]
        #todo - fix textblock::table to allow configure -columncount
        for {set c 0} {$c < $columncount} {incr c} {
            $t add_column
        }

        set vheaders [punk::transpose_equal_lists $headers]
        set hidx -1
        foreach vh $vheaders {
            incr hidx
            $t configure_column $hidx -headers $vh
        }
        $t configure_header 0 -colspans [list 6 0 0 0 0 0 {*}[lrepeat $bitcolumns 1]]
        $t configure_column 0 -blockalign left

        #always 16 rows - remaining bits form the columns
        for {set ridx 0} {$ridx <= 15} {incr ridx} {
            set charlist [list]
            set lowbits [format %04b $ridx]
            for {set i 0} {$i < $bitcolumns} {incr i} {
                set hibits [format %0${hibit_count}b $i]
                set ch [format %c [scan ${hibits}${lowbits} %b]]
                #puts "-->${hibits}${lowbits} ch:$ch"
                if {$which ne "default"} {
                    if {[catch {encoding convertfrom $which $ch} ch]} {
                        set ch [punk::ansi::a red bold]-[punk::ansi::a]
                        lappend charlist $ch
                    } else {
                        lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch]
                    }
                } else {
                    lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch]
                }
            }
            set r [list "" {*}[split $lowbits ""] $ridx {*}$charlist]
            $t add_row $r
        }
        set result [$t print]
        $t destroy
        return $result
    }

    #just the 7-bit ascii. use [page ascii] for the 8-bit layout
    proc ascii {} {return {
        00 NUL  01 SOH  02 STX  03 ETX  04 EOT  05 ENQ  06 ACK  07 BEL
        08 BS   09 HT   0a LF   0b VT   0c FF   0d CR   0e SO   0f SI
        10 DLE  11 DC1  12 DC2  13 DC3  14 DC4  15 NAK  16 SYN  17 ETB
        18 CAN  19 EM   1a SUB  1b ESC  1c FS   1d GS   1e RS   1f US
        20 SP   21  !   22  "   23  #   24  $   25  %   26  &   27  '
        28  (   29  )   2a  *   2b  +   2c  ,   2d  -   2e  .   2f  /
        30  0   31  1   32  2   33  3   34  4   35  5   36  6   37  7
        38  8   39  9   3a  :   3b  ;   3c  <   3d  =   3e  >   3f  ?
        40  @   41  A   42  B   43  C   44  D   45  E   46  F   47  G
        48  H   49  I   4a  J   4b  K   4c  L   4d  M   4e  N   4f  O
        50  P   51  Q   52  R   53  S   54  T   55  U   56  V   57  W
        58  X   59  Y   5a  Z   5b  [   5c  \   5d  ]   5e  ^   5f  _
        60  `   61  a   62  b   63  c   64  d   65  e   66  f   67  g
        68  h   69  i   6a  j   6b  k   6c  l   6d  m   6e  n   6f  o
        70  p   71  q   72  r   73  s   74  t   75  u   76  v   77  w
        78  x   79  y   7a  z   7b  {   7c  |   7d  }   7e  ~   7f DEL
    }}

    #G0 character set
    proc ascii2 {} {
        set dict [asciidict2]
        set out ""
        set i 1
        append out "        "
        tcl::dict::for {k v} $dict {
            #single chars are wrapped with \033(0  and \033(B  ie total length 7
            if {[tcl::string::length $v] == 7} {
                set v " $v "
            } elseif {[tcl::string::length $v] == 2} {
                set v "$v "
            } elseif {[tcl::string::length $v] == 0} {
                set v "   "
            }
            append out "$k $v  "
            if {$i > 0 && $i % 8 == 0} {
                set out [tcl::string::range $out 0 end-2]
                append out \n "        "
            }
            incr i
        }
        set out [tcl::string::trimright $out " "]
        return $out
    }

    namespace eval argdoc {
        variable PUNKARGS
        lappend PUNKARGS [list {
            @id     -id     ::punk::char::ascii2doublewide
            @cmd    -name     punk::char::ascii2doublewide\
                -summary\
                "string to double-wide unicode char"\
                -help\
                "Convert parts of string in the ascii range 21 to 7E to corresponding chars from the unicode
                'Halfwidth and Fullwidth Forms' block.
                The space character (0x20) is converted to the 'IDEOGRAPHIC SPACE' character (0x3000)
                Control chars and chars outside the ascii range are not converted and passed through as-is."
            @values -min 1 -max 1
            str -type string
        }]
    }
    proc ascii2doublewide {str} {
        #set base to \UFEE0 and add ascii value of char to get the double-wide character in the 'Halfwidth and Fullwidth Forms' block of unicode
        set out ""
        foreach ch [split $str ""] {
            set decval [scan $ch %c]
            if {$decval == 0x20} {
                #space char maps to ideographic space rather than the fullwidth space char - as the fullwidth space char is often rendered as a narrow space rather than a wide one - and the ideographic space is more likely to be rendered as a wide space.
                set decval 0x3000
            } elseif {$decval < 0x21 || $decval > 0x7E} {
                #chars outside the ascii range are not converted - pass through as-is
                set decval [scan $ch %c]
            } else {
                set decval [expr {0xFEE0 + $decval}]
            }
            append out [format %c $decval]
        }
        return $out
    }
    proc doublewide2ascii {str} {
        set out ""
        foreach ch [split $str ""] {
            set decval [scan $ch %c]
            if {$decval == 0x3000} {
                #ideographic space maps to ascii space char
                set decval 0x20
                append out \U3000
            } elseif {$decval < 0xFF01 || $decval > 0xFF5E} {
                #chars outside the part of the 'Halfwidth and Fullwidth Forms' block that corresponds to ascii are not converted - pass through as-is
                append out $ch
            } else {
                set decval [expr {$decval - 0xFEE0}]
                append out [format %c $decval]
            }
        }
        return $out
    }

    proc ascii2NTFSPUA {str} {
        #set base to \uF000 and add ascii value of char to get the PUA character in the 'Private Use Area' block of unicode
        set out ""
        foreach ch [split $str ""] {
            set decval [scan $ch %c]
            if {$decval <= 0x7F} {
                append out [format %c [expr {0xF000 + $decval}]]
            } else {
                #chars outside the ascii range are not converted - pass through as-is
                append out $ch
            }
        }
        return $out
    }
    proc NTFSPUA2ascii {str} {
        set out ""
        foreach ch [split $str ""] {
            set decval [scan $ch %c]
            if {$decval >= 0xF000 && $decval <= 0xF0FF} {
                set decval [expr {$decval - 0xF000}]
                append out [format %c $decval]
            } else {
                #chars outside the 'Private Use Area' block that corresponds to ascii are not converted - pass through as-is
                append out $ch
            }
        }
        return $out
    }


    proc symbol {} {
        tailcall page symbol
    }
    proc dingbats {} {
        set out ""
        append out [page dingbats] \n
        set unicode_dict [charset_dictget Dingbats]

        append out "        "
        set i 1
        tcl::dict::for {k charinfo} $unicode_dict {
            set char [tcl::dict::get $charinfo char]
            if {[tcl::string::length $char] == 0} {
                set displayv "   "
            } elseif {[tcl::string::length $char] == 1} {
                set displayv " $char "
            } else {
                set displayv $char
            }
            append out "$k $displayv  "
            if {$i > 0 && $i % 8 == 0} {
                set out [tcl::string::range $out 0 end-2]
                append out \n "        "
            }
            incr i
        }
        return $out
    }
    proc page_names {{search *}} {
        set all_names [list]
        set d [page_names_dict $search]
        tcl::dict::for {k v} $d {
            if {$k ni $all_names} {
                lappend all_names $k
            }
            foreach m $v {
                if {$m ni $all_names} {
                    lappend all_names $m
                }
            }
        }
        return [lsort $all_names]
    }
    proc page_names_help {{namesearch *}} {
        set d [page_names_dict $namesearch]

        set out ""
        tcl::dict::for {k v} $d {
            append out "$k $v" \n
        }
        return [linesort $out]
    }
    proc page_names_dict {{search *}} {
        if {![regexp {[?*]} $search]} {
            set search "*$search*"
        }
        set encnames [encoding names]
        foreach enc $encnames {
            tcl::dict::set d $enc [list]
        }
        variable encmimens
        set mimenames [array get ${encmimens}::reversemap]
        tcl::dict::for {mname encname} $mimenames {
            if {$encname in $encnames} {
                set enclist [tcl::dict::get $d $encname]
                if {$mname ni $enclist} {
                    tcl::dict::lappend d $encname $mname
                }
            }
        }
        foreach enc [lsort $encnames] {
            set mime_enc [${encmimens}::mapencoding $enc]
            if {$mime_enc ne ""} {
                set enclist [tcl::dict::get $d $enc]
                if {$mime_enc ni $enclist} {
                    tcl::dict::lappend d $enc $mime_enc
                }
            }
        }
        set dresult [tcl::dict::create]
        if {$search ne "*"} {
            tcl::dict::for {k v} $d {
                if {[tcl::string::match -nocase $search $k] || ([lsearch -nocase $v $search]) >= 0} {
                    tcl::dict::set dresult $k $v
                }
            }
        } else {
            set dresult $d
        }
        return $dresult
    }
    proc page8 {encname args} {
        tcl::dict::set args -cols 8
        tailcall page $encname {*}$args
    }
    proc page16 {encname args} {
        tcl::dict::set args -cols 16
        tailcall page $encname {*}$args
    }

    #This will not display for example, c0 glyphs for cp437
    # we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
    # for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
    proc page {encname args} {
        variable invalid
        set encname [encname $encname]
        set defaults [list {*}{
            -range  {0 256}
            -cols   16
        }]
        set opts [tcl::dict::merge $defaults $args]
        # -- --- --- --- --- --- --- --- ---
        set cols            [tcl::dict::get $opts -cols]
        # -- --- --- --- --- --- --- --- ---

        set d_bytedisplay [basedict_display]

        #set d_ascii [pagedict_raw ascii]
        set d_ascii [basedict]
        set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi
        #The results of this are best seen by comparing the ebcdic and ascii pages

        set d_page  [pagedict_raw $encname]

        set out ""
        set i 1
        append out "        "
        tcl::dict::for {k rawchar} $d_page {
            set num [expr {"0x$k"}]
            #see if ascii equivalent exists and has a name
            if {$rawchar eq $invalid} {
                set displayv "$invalid"
            } else {
                set bytedisplay ""
                if {[tcl::dict::exists $d_asciiposn $rawchar]} {
                    set asciiposn [tcl::dict::get $d_asciiposn $rawchar]
                    set bytedisplay [tcl::dict::get $d_bytedisplay $asciiposn]
                }
                if {$bytedisplay eq $invalid} {
                    #
                    set displayv " $rawchar "
                } else {
                    set displaylen [tcl::string::length $bytedisplay]
                    if {$displaylen == 2} {
                        set displayv "$bytedisplay "
                    } elseif {$displaylen == 3} {
                        set displayv $bytedisplay
                    } else {
                        if {[tcl::string::length $rawchar] == 0} {
                            set displayv "   "
                        } else {
                            #presumed 1
                            set displayv " $rawchar "
                        }
                    }
                }
            }

            append out "$k $displayv  "
            if {$i > 0 && $i % $cols == 0} {
                set out [tcl::string::range $out 0 end-2]
                append out \n "        "
            }
            incr i
        }
        set out [tcl::string::trimright $out " "]
        return $out
    }

    proc pagechar1 {page num} {
        set encpage [encname $page]
        encoding convertfrom $encpage [format %c $num]
    }

    proc pagechar {page num} {
        set encpage [encname $page]

        set ch [format %c $num]
        if {[decodable $ch $encpage]} {
            set outchar [encoding convertfrom $encpage $ch]
        } else {
            #here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW.
            set outchar $::punk::char::invalid_display_char
        }
        return $outchar
    }
    proc pagechar_info {page num} {
        set ch [format %c $num]
        set h [format %04x $num]
        set encpage [encname $page]
        if {[decodable $ch $encpage]} {
            set outchar [encoding convertfrom $encpage $ch]
        } else {
            error "pagechar_info: $h not decodable from $encpage"
        }
        package require punk::console
        puts -nonewline stdout \033\[s;flush stdout
        lassign [punk::console::get_cursor_pos_list] _row1 col1
        puts -nonewline stdout "$outchar";flush stdout
        lassign [punk::console::get_cursor_pos_list] _row2 col2
        puts -nonewline stdout "\033\[u";flush stdout
        return "$col1 -> $col2"
    }

    proc pagebyte {page num} {
        set encpage [encname $page]

        set ch [format %c $num]
        if {[decodable $ch $encpage]} {
            #set outchar [encoding convertto $encpage [format %c $num]]
            set outchar [format %c $num]
        } else {
            set outchar $::punk::char::invalid_display_char
        }
        return $outchar
    }

    proc all_pages {} {
        set out ""
        set mimenamesdict [page_names_dict]
        foreach encname [encoding names] {
            if {[tcl::dict::exists $mimenamesdict $encname]} {
                set alt "([tcl::dict::get $mimenamesdict $encname])"
            } else {
                set alt ""
            }
            append out "$encname $alt" \n
            append out [page $encname]
        }
        return $out
    }

    proc encname {encoding_name_or_alias} {
        set encname $encoding_name_or_alias
        if {[lsearch -nocase [page_names] $encname] < 0} {
            error "Unknown encoding '$encname' - use 'punk::char::page_names' to see valid encoding names on this system"
        }
        variable encmimens
        if {$encname ni [encoding names]} {
            set encname [${encmimens}::reversemapencoding $encname]
        }
        return $encname
    }

    proc pagedict_raw {encname} {
        variable invalid ;# ="???"
        set encname [encname $encname]
        set d [tcl::dict::create]
        for {set i 0} {$i < 256} {incr i} {
            set k [format %02x $i]
            #tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]]
            set ch [format %c $i] ;
            #jmn
            if {[decodable $ch $encname]} {
                #set encchar [encoding convertto $encname $ch]
                #tcl::dict::set d $k [encoding convertfrom $encchar]
                tcl::dict::set d $k [encoding convertfrom $encname $ch]
            } else {
                tcl::dict::set d $k $invalid ;#use replacement so we can detect difference from actual "?"
            }
        }
        return $d
    }
    proc asciidict {} {
        variable invalid
        set d [tcl::dict::create]
        set a128 [asciidict128]
        for {set i 0} {$i < 256} {incr i} {
            set k [format %02x $i]
            if {$i <= 127} {
                tcl::dict::set d $k [tcl::dict::get $a128 $k]
            } else {
                #
                tcl::dict::set d $k $invalid
            }

            if {$i <=32} {
                #no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
                tcl::dict::set d $k [tcl::dict::get $a128 $k]
            } else {
                if {$i == 0x9b} {
                    tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
                } else {
                    tcl::dict::set d $k [format %c $i]
                }
            }
        }
        return $d
    }

    proc basedict_display {} {
        set d [tcl::dict::create]
        set a128 [asciidict128]
        for {set i 0} {$i < 256} {incr i} {
            set k [format %02x $i]
            if {$i <=32} {
                #no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
                tcl::dict::set d $k [tcl::dict::get $a128 $k]
            } else {
                if {$i == 0x9b} {
                    tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
                } elseif {$i == 0x9c} {
                    tcl::dict::set d $k OSC
                } else {
                    #tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]]
                    #don't use encoding convertfrom - we want the value independent of the current encoding system.
                    tcl::dict::set d $k [format %c $i]
                }
            }
        }
        return $d
    }
    proc basedict_encoding_system {} {
        #result depends on 'encoding system' currently in effect
        set d [tcl::dict::create]
        for {set i 0} {$i < 256} {incr i} {
            set k [format %02x $i]
            tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]]
        }
        return $d
    }

    proc basedict {} {
        #this gives same result independent of current value of 'encoding system'
        set d [tcl::dict::create]
        for {set i 0} {$i < 256} {incr i} {
            set k [format %02x $i]
            tcl::dict::set d $k [format %c $i]
        }
        return $d
    }
    proc pagedict {pagename args} {
        variable  charsets
        set encname [encname $pagename]
        set defaults [list {*}{
            -range      {0 255}
            -charset    ""
        }]
        set opts [tcl::dict::merge $defaults $args]
        # -- --- --- --- --- --- --- --- --- ---
        set range       [tcl::dict::get $opts -range]
        set charset     [tcl::dict::get $opts -charset]
        # -- --- --- --- --- --- --- --- --- ---
        if {$charset ne ""} {
            if {$charset ni [charset_names]} {
                error "unknown charset '$charset' - use 'charset_names' to get list"
            }
            set setinfo [tcl::dict::get $charsets $charset]
            set ranges [tcl::dict::get $setinfo ranges]
            set charset_dict [tcl::dict::create]
            foreach r $ranges {
                set start [tcl::dict::get $r start]
                set end   [tcl::dict::get $r end]
                #set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]]
                break
            }

        } else {
            set start [lindex $range 0]
            set end  [lindex $range 1]
        }

        set d [tcl::dict::create]
        for {set i $start} {$i <= $end} {incr i} {
            set k [format %02x $i]
            tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]]
        }
        return $d
    }

    #todo - benchmark peformance - improve punk pipeline
    proc asciidict128 {} {
        regexp -all -inline {\S+} [concat {*}[linelist -line trimleft [ascii]]]
    }
    proc _asciidict128 {} {
        .= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data}
    }

    #review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents
    proc asciidict2 {} {
        set d [tcl::dict::create]
        tcl::dict::for {k v} [basedict_display] {
            if {[tcl::string::length $v] == 1} {
                set num [expr {"0x$k"}]
                #tcl::dict::set d $k "\033(0[subst \\u00$k]\033(B"
                tcl::dict::set d $k "\033(0[format %c $num]\033(B"
            } else {
                tcl::dict::set d $k $v
            }
        }
        return $d
    }

    #-- --- --- --- --- --- --- ---
    # encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.)
    # e.g encoding convertto dingbats <somethingpretty>  will output something that doesn't look dingbatty on screen.
    #-- --- --- --- --- --- --- ---
    #must use Tcl instead of tcl (at least for 8.6)
    if {![package vsatisfies [package present Tcl] 8.7-]} {
        proc encodable "s {enc [encoding system]}" {
            set encname [encname $enc]
            if {($encname eq "ascii")} {
                #8.6 fails to round-trip convert 0x7f del character despite it being in the ascii range (review Why?? what else doesn't round-trip but should?)
                #just strip it out of the string as we are only after a boolean answer and if s is only a single del char empty string will result in true
                set s [tcl::string::map [list [format %c 0x7f] ""] $s]
            }
            string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
        }
        #note also that tcl8.6 has anomalies with how it handles some unassigned codepoints
        # e.g unassigned codes in the middle of a codepage may appear to be encodable&decodable in a round trip whereas undefined codepoints at the end may get the replacement character defined in the tcl encodings dir (usually the 3f char: "?")
        proc decodable "s {enc [encoding system]}" {
            set encname [encname $enc]
            #review
            string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
        }
    } else {
        #review - use -profile?
        proc encodable "s {enc [encoding system]}" {
            set encname [encname $enc]
            if {![catch {
                string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
            } result]} {
                return $result
            } else {
                return 0
            }
        }
        proc decodable "s {enc [encoding system]}" {
            set encname [encname $enc]
            if {![catch {
                string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
            } result]} {
                return $result
            } else {
                return 0
            }
        }
    }
    #-- --- --- --- --- --- --- ---
    proc test_japanese {{encoding jis0208}} {
        #A very basic test of 2char encodings such as jis0208
        set yatbun 日本  ;# encoding convertfrom jis0208 F|K\\
        lassign [split $yatbun] yat bun
        puts "original yatbun ${yat} ${bun}"
        set eyat [encoding convertto $encoding $yat]
        set ebun [encoding convertto $encoding $bun]
        puts "$encoding encoded: ${eyat} ${ebun}"
        puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]"
        return $yatbun
    }
    proc test_grave {} {
        set g [format %c 0x300]
        puts stdout "Testing console display of grave accented a in between letters x and y - accent should combine over the top of the letter a."
        puts stdout "Apparent width should theoretically be 1 console-column"
        package require punk::console
        puts stdout "# -- --- --- ---"
        puts -nonewline "xa${g}z";set cursorposn [punk::console::get_cursor_pos]
        puts stdout \n
        puts stdout "cursor position immediately after outputing 4 bytes (expecting 3 glyphs): $cursorposn"
        puts stdout "# -- --- --- ---"
        puts -nonewline "xyz";set cursorposn [punk::console::get_cursor_pos]
        puts stdout \n
        puts stdout "cursor position immediately after outputing 3 bytes (xyz): $cursorposn"
    }
    proc test_zalgo {} {
        #from: https://github.com/jameslanska/unicode-display-width/blob/5e28d94c75e8c421a87199363b85c90dc37125b8/docs/unicode_background.md
        #see: https://lingojam.com/ZalgoText
        puts stdout "44 chars long - 9 graphemes - 9 columns wide"

        set str "Ẓ̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍ṭ̓̓ͅ"

    }

    proc test_zalgo2 {} {

        # ------------------------


        set str "Z̸̢͉̣͔̭̪͙̖̳̘͈͍̤̩̟͈͈͕̯̅̏̆̓̌́́͌̿̕ͅą̷̦̤̫̩͕̥̐̎̓́́̂̀͆́̔̄̈́̏̌́̆͜͜͝͠l̴̩̙̺͚̟͇͖͔͕̹̟͌̈́̄̇́̉̋̕̚͜͠͠g̸͇̩͔̺̝̗̥̖̙̑̈́̈́̿̾̌͌͊̈̀͊̑̇̑͌̍̅̌͊͜͠͝ǫ̴̢̧͎͔̞̤̻̱̮̬͕̗̭͍̣̠̳̆̀̋̓͒̾̄͜͝͠͝T̴̛͉̬̋́̈́̈́̓͗̍̏̔̈̋̀͐̓̎̅̚̕͠͝͝ê̵̖̖̈͊̔̓̔̐̓̃͊̋͋̎͂͋̕x̴̡͎͖̼͎̦͎̲̪̘̺̯̲̬̮̥̼̰͌̃͜ͅt̶̛̘͎̰̔͐͌́̈́͊͗͌̅͂̐̆͌̀͂͘"


        # ------------------------
    }

    proc test_zalgo3 {} {

        # ------------------------


       set str "Ẕ̸̢̼̺̜̰̣̣̖̭̜͓͖̺̥̼̠͙͙̟̥̟̠̤̫͍̠̤̮͚̝̜̙͈̦̙̩̹̙̜̩̦͔͕̈̃̅̇̈́͂̇̽̈́́́́̎͊̂̑̋͆̔̾͋̚͜ͅã̵̛̪̽̎̃͑̒̇͋͂̽̃̍͊̀̈̊̊̔̊̆̈́͗͑͗̽̄̋͗̄͌̑͊͝͝͠ͅl̵̢͇̖͉͖̝̹̜̞͓͎͍͈̞̱̙͙̦̪͔̱̮͈͉̼͎̭̝̯͇͚̺̟̱͙̳̰̙͚͖̝̫͙̎̅̃͆̈́̋̌̔̋̋͗̈́̔̐͆̈́̓̾̄̀́̏́͒̆̌͒̈́̈́̾̏̀͜͝g̸̖͂͋̊͗̈́̓̆̈̋̒͐̕o̶̧̢͓̲̗̠̘͕̦̤̹̗͉̰͔̯͓̭̹̻͔͇̯̜̙̍̂̃̃̀̓͌̒͊̊̋̒̿̿̌͐̄͗̾̕͝T̶̛̳̜̰͍̹̻̫̠̱̼̼̙̆̑̾̾͛́́̿͋͌̎̀̀̽̆͌̽̂̈̇̅̇̃́̿͗̾͒̎͊̑̾͝͠ȩ̸̢̨̛̛̛̭͔͎͇̫͎͈̲̻̙͕͎̣͈̩̺̗͖͙͇͌͐̒̎͐̓́̉́͐̓̐͌̅̊͋͑̈́͗͑̏̕͜͜͝ͅx̸̧̧̛̖̣̥̘͎͎̳̭̙̦̝͖̝̮̱̹̯̺̙̩̯̮͖̻̰͓̰̩͇̥̑͌̊̐̉̏͆̓̑̎̊̓͒̂̄̆͆̀̊̄̈̽͐͛̏͊̓̌͑́̎̈́̍̈́̊͗̉̋͆̿̊͘͜͜͝͝ͅͅͅt̵̡̨̜̘͎̦͚̠̗̺͉̼̠̲̠͙̺̹̗̲̏̈́̂̚͜͜͝ͅ"


        # ------------------------

    }


    proc test_farmer {} {
        #an interesting article re grapheme clustering problems in terminals https://mitchellh.com/writing/grapheme-clusters-in-terminals
        #(similar to the problem with grave accent rendering width that the test_grave proc is written for)
        # -- --- --- --- ---
        #These pasted glyphs can display in console even when the unicode versions don't (tcl 8.6 limited to 65533/FFFD ?)
        upvar farmer1_paste test_farmer1
        upvar farmer2_paste test_farmer2
        set test_farmer1 🧑‍🌾   ;#contains zero-width joiner between
        set test_farmer2 🧑🌾
        puts "pasted farmer1 exporting as var farmer1_paste: $test_farmer1"
        puts "pasted farmer2 exporting as var farmer2_paste: $test_farmer2"
        # -- --- --- --- ---


        set farmer1 "\U0001f9d1\U0000200d\U0001f33e"
        set farmer2 "\U0001f9d1\U0001f33e"
        puts stdout "farmer1 with zero-width joiner, codes: \\U0001f9d1\\U0000200d\\U0001f33e   : $farmer1"
        puts stdout "farmer2 with no joiner          codes: \\U0001f9d1\\U001f33e               : $farmer2"

        package require punk::console
        puts stdout \n
        puts stdout "#2--5---9---C---"
        puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1
        puts -nonewline "${farmer1}";set cursorposn [punk::console::get_cursor_pos_list]
        puts stdout "\ncursor position immediately after outputing farmer1 (expecting 1 glyph 2 wide)          : cursor at col [lindex $cursorposn 1]"
        if {[lindex $cursorposn 1] eq "3"} {
            puts stdout "[a+ green]OK[a]"
        } else {
            puts stdout "[a+ red]ERR - expected cursor position to be 3 after emitting farmer1[a]"
        }
        puts stdout "----------------"
        puts stdout "#2--5---9---C---"
        puts -nonewline "${farmer2}";set cursorposn [punk::console::get_cursor_pos_list]
        puts stdout "\ncursor position immediately after outputing farmer2 (expecting 2 glyphs 4 wide in total): cursor at col [lindex $cursorposn 1]"
        if {[lindex $cursorposn 1] eq "5"} {
            puts stdout "[a+ green]OK[a]"
        } else {
            puts stdout "[a+ red]ERR - expected cursor position to be 5 after emitting farmer2[a]"
        }
        puts stdout "----------------"

        puts "returning farmer1 - should be single glyph"
        return $farmer1
    }

    proc _show_graphemes {str} {
        set tk_graphemes [punk::char::grapheme_split_tk $str]
        puts "graphemes from tk:"
        set i 0
        foreach g $tk_graphemes {
            puts " $i: '$g'"
            incr i
        }
        set punk_graphemes [punk::char::grapheme_split $str]
        puts "graphemes from punk::char::grapheme_split:"
        set i 0
        foreach g $punk_graphemes {
            puts " $i: '$g'"
            incr i
        }
    }

    proc test_malayalam1 {} {
        set str "മലയാളം"
        puts "Malayalam string: $str"
        _show_graphemes $str
        puts "$str"
        puts [punk::ansi::mark_columns -number below -marker {| .} {1 5 10} {2 3 4 6 7 8 9}]
        return $str
    }
    proc test_malayalam2 {} {
        set str "സന്തോഷ്"
        puts "Malayalam string: $str"
        _show_graphemes $str
        puts "$str"
        puts [punk::ansi::mark_columns -number below -marker {| .} {1 5 10} {2 3 4 6 7 8 9}]
        return $str
    }

    #G0 Sets Sequence   G1 Sets Sequence   Meaning
    #ESC ( A            ESC ) A            United Kingdom Set
    #ESC ( B            ESC ) B            ASCII Set
    #ESC ( 0            ESC ) 0            Special Graphics
    #ESC ( 1            ESC ) 1            Alternate Character ROM Standard Character Set
    #ESC ( 2            ESC ) 2            Alternate Character ROM Special Graphic

    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    # Unicode character sets - some hardcoded - some loadable from data files
    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    variable charinfo [tcl::dict::create]
    variable charsets [tcl::dict::create]


    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    # Aggregate character sets (ones that pick various ranges from underlying unicode ranges)
    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    tcl::dict::set charsets "WGL4" [list altname "Windows Glyph List 4" ranges [list {*}{
        {start 0        end 127     name "basic latin"}
        {start 128      end 255     name "latin-1 supplement"}
        {start 256      end 383     name "Latin Extended-A"}
        {start 402      end 402     name "subset Latin Extended-B"}
        {start 506      end 511     name "subset Latin Extended-B"}
        {start 710      end 711     name "subset Spacing Modifier Letters"}
        {start 713      end 713     name "subset Spacing Modifier Letters"}
        {start 728      end 733     name "subset Spacing Modifier Letters"}
        {start 900      end 906     name "subset Greek"}
        {start 908      end 908     name "subset Greek"}
        {start 910      end 974     name "subset Greek"}
        {start 1024     end 1119    name "subset Cyrillic"}
        {start 1168     end 1169    name "subset Cyrillic"}
        {start 7808     end 7813    name "subset Latin Extended Additional"}
        {start 7922     end 7923    name "subset Latin Extended Additional"}
        {start 8211     end 8213    name "subset General Punctuation"}
        {start 8215     end 8222    name "subset General Punctuation"}
        {start 8224     end 8226    name "subset General Punctuation"}
        {start 8230     end 8230    name "subset General Punctuation"}
        {start 8240     end 8240    name "subset General Punctuation"}
        {start 8242     end 8243    name "subset General Punctuation"}
        }] description "Microsoft WGL4 Repertoire" settype "other"]


    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    #The base page 0-256 8bit range - values don't have specific characters or descriptions - as they are codepage dependent
    #we will fill this here for completeness - but with placeholders
    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    tcl::dict::set charsets "8bit" [list ranges [list {start 0 end 127 name ascii} {start 128 end 255 name 8bit-ascii}] description "8bit extended ascii range" settype "other"]
    for {set i 0} {$i < 256} {incr i} {
        tcl::dict::set charinfo $i [list desc "codepage-dependent" short "byte_[format %02x $i]"]
    }

    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    # Unicode ranges
    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    tcl::dict::set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"]


    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    tcl::dict::set charsets "Block Elements" [list ranges [list {start 9600 end 9631}] description "Block Elements" settype "other"]
    tcl::dict::set charinfo 9600 [list desc "Upper Half Block"                                    short "blocke_up_half"]
    tcl::dict::set charinfo 9601 [list desc "Lower One Eighth Block"                              short "blocke_lw_1_8th"]
    tcl::dict::set charinfo 9602 [list desc "Lower One Quarter Block"                             short "blocke_lw_1_qtr"]


    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    tcl::dict::set charsets "Dingbats" [list ranges [list {start 9984 end 10175 }] description "Unicode Dingbats" settype "tcl_supplemented"]
    tcl::dict::set charinfo 9984 [list desc "Black Safety Scissors"                               short "dingbats_black_safety_scissors"]
    #...
    tcl::dict::set charinfo 10175 [list desc "Double Curly Loop"                                  short "dingbats_double_curly_loop"]


    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    #variation selectors 0xFe01 - 0xFE0F
    tcl::dict::set charsets "Variation Selectors" [list ranges [list {start 65024 end 65039}] description "Variation Selectors" note "combining character with previous char - variant glyph display" settype "tcl_supplemented"]
    tcl::dict::set charinfo 65024 [list desc "Variation Selector-1"                                                short "VS1"]
    tcl::dict::set charinfo 65025 [list desc "Variation Selector-2"                                                short "VS2"]
    tcl::dict::set charinfo 65026 [list desc "Variation Selector-3"                                                short "VS3"]
    tcl::dict::set charinfo 65027 [list desc "Variation Selector-4"                                                short "VS4"]
    tcl::dict::set charinfo 65027 [list desc "Variation Selector-5"                                                short "VS5"]
    tcl::dict::set charinfo 65029 [list desc "Variation Selector-6"                                                short "VS6"]
    tcl::dict::set charinfo 65030 [list desc "Variation Selector-7"                                                short "VS7"]
    tcl::dict::set charinfo 65031 [list desc "Variation Selector-8"                                                short "VS8"]
    tcl::dict::set charinfo 65032 [list desc "Variation Selector-9"                                                short "VS9"]
    tcl::dict::set charinfo 65033 [list desc "Variation Selector-10"                                               short "VS10"]
    tcl::dict::set charinfo 65034 [list desc "Variation Selector-11"                                               short "VS11"]
    tcl::dict::set charinfo 65035 [list desc "Variation Selector-12"                                               short "VS12"]
    tcl::dict::set charinfo 65036 [list desc "Variation Selector-13"                                               short "VS13"]
    tcl::dict::set charinfo 65037 [list desc "Variation Selector-14"                                               short "VS14"]
    tcl::dict::set charinfo 65038 [list desc "Variation Selector-15 text variation"                                short "VS15"] ;#still an image - just more suitable for text-presentation e.g word-processing doc
    tcl::dict::set charinfo 65039 [list desc "Variation Selector-16 emoji variation"                               short "VS16"]


    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    # emoticons https://www.unicode.org/charts/PDF/U1F600.pdf
    tcl::dict::set charsets "Emoticons" [list ranges [list {start 128512 end 128591}] description "Emoticons" settype "tcl_supplemented"]
    tcl::dict::set charinfo 128512 [list desc "Grinning Face"                                     short "emoticon_gface"]
    tcl::dict::set charinfo 128513 [list desc "Grinning Face with Smiling Eyes"                   short "emoticon_gface_smile_eyes"]
    tcl::dict::set charinfo 128514 [list desc "Face with Tears of Joy"                            short "emoticon_face_tears_joy"]

    #todo
    tcl::dict::set charinfo 128590 [list desc "Person with Pouting Face"                          short "emoticon_person_pout"]

    # -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
    tcl::dict::set charsets "Box Drawing" [list ranges [list {start 9472 end 9599}] description "Box Drawing" settype "tcl_supplemented"]
    tcl::dict::set charinfo 9472 [list desc "Box Drawings Light Horizontal"                       short "boxd_lhz"]
    tcl::dict::set charinfo 9473 [list desc "Box Drawings Heavy Horizontal"                       short "boxd_hhz"]
    tcl::dict::set charinfo 9474 [list desc "Box Drawings Light Vertical"                         short "boxd_lv"]
    tcl::dict::set charinfo 9475 [list desc "Box Drawings Heavy Vertical"                         short "boxd_hv"]
    tcl::dict::set charinfo 9476 [list desc "Box Drawings Light Triple Dash Horizontal"           short "boxd_ltdshhz"]
    tcl::dict::set charinfo 9477 [list desc "Box Drawings Heavy Triple Dash Horizontal"           short "boxd_htdshhz"]
    tcl::dict::set charinfo 9478 [list desc "Box Drawings Light Triple Dash Vertical"             short "boxd_ltdshv"]
    tcl::dict::set charinfo 9479 [list desc "Box Drawings Heavy Triple Dash Vertical"             short "boxd_htdshv"]
    tcl::dict::set charinfo 9480 [list desc "Box Drawings Light Quadruple Dash Horizontal"        short "boxd_lqdshhz"]
    tcl::dict::set charinfo 9481 [list desc "Box Drawings Heavy Quadruple Dash Horizontal"        short "boxd_hqdshhz"]
    tcl::dict::set charinfo 9482 [list desc "Box Drawings Light Quadruple Dash Vertical"          short "boxd_lqdshv"]
    tcl::dict::set charinfo 9483 [list desc "Box Drawings Heavy Quadruple Dash Vertical"          short "boxd_hqdshv"]
    tcl::dict::set charinfo 9484 [list desc "Box Drawings Light Down and Right"                   short "boxd_ldr"]
    tcl::dict::set charinfo 9485 [list desc "Box Drawings Down Light and Right Heavy"             short "boxd_dlrh"]
    tcl::dict::set charinfo 9486 [list desc "Box Drawings Down Heavy and Right Light"             short "boxd_dhrl"]
    tcl::dict::set charinfo 9487 [list desc "Box Drawings Heavy Down and Right"                   short "boxd_hdr"]
    tcl::dict::set charinfo 9488 [list desc "Box Drawings Light Down and Left"                    short "boxd_ldl"]
    tcl::dict::set charinfo 9489 [list desc "Box Drawings Down Light and Left Heavy"              short "boxd_dllh"]
    tcl::dict::set charinfo 9490 [list desc "Box Drawings Down Heavy and Left Light"              short "boxd_dhll"]
    tcl::dict::set charinfo 9491 [list desc "Box Drawings Heavy Down and Left"                    short "boxd_hdl"]
    tcl::dict::set charinfo 9492 [list desc "Box Drawings Light Up and Right"                     short "boxd_lur"]
    tcl::dict::set charinfo 9493 [list desc "Box Drawings Up Light and Right Heavy"               short "boxd_ulrh"]
    tcl::dict::set charinfo 9494 [list desc "Box Drawings Up Heavy and Right Light"               short "boxd_uhrl"]
    tcl::dict::set charinfo 9495 [list desc "Box Drawings Heavy Up and Right"                     short "boxd_hur"]
    tcl::dict::set charinfo 9496 [list desc "Box Drawings Light Up and Left"                      short "boxd_lul"]
    tcl::dict::set charinfo 9497 [list desc "Box Drawings Up Light and Left Heavy"                short "boxd_ullh"]
    tcl::dict::set charinfo 9498 [list desc "Box Drawings Up Heavy and Left Light"                short "boxd_uhll"]
    tcl::dict::set charinfo 9499 [list desc "Box Drawings Heavy Up and Left"                      short "boxd_hul"]
    tcl::dict::set charinfo 9500 [list desc "Box Drawings Light Vertical and Right"               short "boxd_lvr"]
    tcl::dict::set charinfo 9501 [list desc "Box Drawings Vertical Light and Right Heavy"         short "boxd_vlrh"]
    tcl::dict::set charinfo 9502 [list desc "Box Drawings Up Heavy and Right Down Light"          short "boxd_uhrdl"]
    tcl::dict::set charinfo 9503 [list desc "Box Drawings Down Heavy and Right Up Light"          short "boxd_dhrul"]
    tcl::dict::set charinfo 9504 [list desc "Box Drawings Vertical Heavy and Right Light"         short "boxd_vhrl"]
    tcl::dict::set charinfo 9505 [list desc "Box Drawings Down Light and Right Up Heavy"          short "boxd_dlruh"]
    tcl::dict::set charinfo 9506 [list desc "Box Drawings Up Light and Right Down Heavy"          short "boxd_ulrdh"]
    tcl::dict::set charinfo 9507 [list desc "Box Drawings Heavy Vertical and Right"               short "boxd_hvr"]
    tcl::dict::set charinfo 9508 [list desc "Box Drawings Light Vertical and Left"                short "boxd_lvl"]
    tcl::dict::set charinfo 9509 [list desc "Box Drawings Vertical Light and Left Heavy"          short "boxd_vllh"]
    tcl::dict::set charinfo 9510 [list desc "Box Drawings Up Heavy and Let Down Light"            short "boxd_uhldl"]
    tcl::dict::set charinfo 9511 [list desc "Box Drawings Down Heavy and Left Up Light"           short "boxd_dhlul"]
    tcl::dict::set charinfo 9512 [list desc "Box Drawings Vertical Heavy and Left Light"          short "boxd_vhll"]
    tcl::dict::set charinfo 9513 [list desc "Box Drawings Down Light and left Up Heavy"           short "boxd_dlluh"]
    tcl::dict::set charinfo 9514 [list desc "Box Drawings Up Light and Left Down Heavy"           short "boxd_ulldh"]
    tcl::dict::set charinfo 9515 [list desc "Box Drawings Heavy Vertical and Left"                short "boxd_hvl"]
    tcl::dict::set charinfo 9516 [list desc "Box Drawings Light Down and Horizontal"              short "boxd_ldhz"]
    tcl::dict::set charinfo 9517 [list desc "Box Drawings Left Heavy and Right Down Light"        short "boxd_lhrdl"]
    tcl::dict::set charinfo 9518 [list desc "Box Drawings Right Heavy and Left Down Light"        short "boxd_rhldl"]
    tcl::dict::set charinfo 9519 [list desc "Box Drawings Down Light and Horizontal Heavy"        short "boxd_dlhzh"]
    tcl::dict::set charinfo 9520 [list desc "Box Drawings Down Heavy and Horizontal Light"        short "boxd_dhhzl"]
    tcl::dict::set charinfo 9521 [list desc "Box Drawings Right Light and Left Down Heavy"        short "boxd_rlldh"]
    tcl::dict::set charinfo 9522 [list desc "Box Drawings Left Light and Right Down Heavy"        short "boxd_llrdh"]
    tcl::dict::set charinfo 9523 [list desc "Box Drawings Heavy Down and Horizontal"              short "boxd_hdhz"]
    tcl::dict::set charinfo 9524 [list desc "Box Drawings Light Up and Horizontal"                short "boxd_luhz"]
    tcl::dict::set charinfo 9525 [list desc "Box Drawings Left Heavy and Right Up Light"          short "boxd_lhrul"]
    tcl::dict::set charinfo 9526 [list desc "Box Drawings Right Heavy and Left Up Light"          short "boxd_rhlul"]
    tcl::dict::set charinfo 9527 [list desc "Box Drawings Up Light and Horizontal Heavy"          short "boxd_ulhzh"]
    tcl::dict::set charinfo 9528 [list desc "Box Drawings Up Heavy and Horizontal Light"          short "boxd_uhhzl"]
    tcl::dict::set charinfo 9529 [list desc "Box Drawings Right Light and Left Up Heavy"          short "boxd_rlluh"]
    tcl::dict::set charinfo 9530 [list desc "Box Drawings Left Light and Right Up Heavy"          short "boxd_llruh"]
    tcl::dict::set charinfo 9531 [list desc "Box Drawings Heavy Up and Horizontal"                short "boxd_huhz"]
    tcl::dict::set charinfo 9532 [list desc "Box Drawings Light Vertical and Horizontal"          short "boxd_lvhz"]
    tcl::dict::set charinfo 9533 [list desc "Box Drawings Left Heavy and Right Vertical Light"    short "boxd_lhrvl"]
    tcl::dict::set charinfo 9534 [list desc "Box Drawings Right Heavy and Left Vertical Light"    short "boxd_rhlvl"]
    tcl::dict::set charinfo 9535 [list desc "Box Drawings Vertical Light and Horizontal Heavy"    short "boxd_vlhzh"]
    tcl::dict::set charinfo 9536 [list desc "Box Drawings Up Heavy and Down Horizontal Light"     short "boxd_uhdhzl"]
    tcl::dict::set charinfo 9537 [list desc "Box Drawings Down Heavy and Up Horizontal Light"     short "boxd_dhuhzl"]
    tcl::dict::set charinfo 9538 [list desc "Box Drawings Vertical Heavy and Horizontal Light"    short "boxd_vhhzl"]
    tcl::dict::set charinfo 9539 [list desc "Box Drawings Left Up Heavy and Right Down Light"     short "boxd_luhrdl"]
    tcl::dict::set charinfo 9540 [list desc "Box Drawings Right Up Heavy and Left Down Light"     short "boxd_ruhldl"]
    tcl::dict::set charinfo 9541 [list desc "Box Drawings Left Down Heavy and Right Up Light"     short "boxd_ldhrul"]
    tcl::dict::set charinfo 9542 [list desc "Box Drawings Right Down Heavy and Left Up Light"     short "boxd_rdhlul"]
    tcl::dict::set charinfo 9543 [list desc "Box Drawings Down Light and Up Horizontal Heavy"     short "boxd_dluhzh"]
    tcl::dict::set charinfo 9544 [list desc "Box Drawings Up Light and Down Horizontal Heavy"     short "boxd_dldhzh"]
    tcl::dict::set charinfo 9545 [list desc "Box Drawings Right Light and Left Vertical Heavy"    short "boxd_rllvh"]
    tcl::dict::set charinfo 9546 [list desc "Box Drawings Left Light and Right Vertical Heavy"    short "boxd_llrvh"]
    tcl::dict::set charinfo 9547 [list desc "Box Drawings Heavy Vertical and Horizontal"          short "boxd_hvhz"]
    tcl::dict::set charinfo 9548 [list desc "Box Drawings Light Double Dash Horizontal"           short "boxd_lddshhz"]
    tcl::dict::set charinfo 9549 [list desc "Box Drawings Heavy Double Dash Horizontal"           short "boxd_hddshhz"]
    tcl::dict::set charinfo 9550 [list desc "Box Drawings Light Double Dash Vertical"             short "boxd_lddshv"]
    tcl::dict::set charinfo 9551 [list desc "Box Drawings Heavy Double Dash Vertical"             short "boxd_hddshv"]
    tcl::dict::set charinfo 9552 [list desc "Box Drawings Double Horizontal"                      short "boxd_dhz"]
    tcl::dict::set charinfo 9553 [list desc "Box Drawings Double Vertical"                        short "boxd_dv"]
    tcl::dict::set charinfo 9554 [list desc "Box Drawings Down Single and Right Double"           short "boxd_dsrd"]
    tcl::dict::set charinfo 9555 [list desc "Box Drawings Down Double and Right Single"           short "boxd_ddrs"]
    tcl::dict::set charinfo 9556 [list desc "Box Drawings Double Down and Right"                  short "boxd_ddr"]
    tcl::dict::set charinfo 9557 [list desc "Box Drawings Down Single and Left Double"            short "boxd_dsld"]
    tcl::dict::set charinfo 9558 [list desc "Box Drawings Down Double and Left Single"            short "boxd_ddls"]
    tcl::dict::set charinfo 9559 [list desc "Box Drawings Double Down and Left"                   short "boxd_ddl"]
    tcl::dict::set charinfo 9560 [list desc "Box Drawings Up Single and Right Double"             short "boxd_usrd"]
    tcl::dict::set charinfo 9561 [list desc "Box Drawings Up Double and Right Single"             short "boxd_udrs"]
    tcl::dict::set charinfo 9562 [list desc "Box Drawings Double Up and Right"                    short "boxd_dur"]
    tcl::dict::set charinfo 9563 [list desc "Box Drawings Up Single and Left Double"              short "boxd_usld"]
    tcl::dict::set charinfo 9564 [list desc "Box Drawings Up Double and Left Single"              short "boxd_udls"]
    tcl::dict::set charinfo 9565 [list desc "Box Drawings Double Up and Left"                     short "boxd_dul"]
    tcl::dict::set charinfo 9566 [list desc "Box Drawings Vertical Single and Right Double"       short "boxd_vsrd"]
    tcl::dict::set charinfo 9567 [list desc "Box Drawings Vertical Double and Right Single"       short "boxd_vdrs"]
    tcl::dict::set charinfo 9568 [list desc "Box Drawings Double Vertical and Right"              short "boxd_dvr"]
    tcl::dict::set charinfo 9569 [list desc "Box Drawings Vertical Single and Left Double"        short "boxd_vsld"]
    tcl::dict::set charinfo 9570 [list desc "Box Drawings Vertical Double and Left Single"        short "boxd_vdls"]
    tcl::dict::set charinfo 9571 [list desc "Box Drawings Double Vertical and Left"               short "boxd_dvl"]
    tcl::dict::set charinfo 9572 [list desc "Box Drawings Down Single and Horizontal Double"      short "boxd_dshzd"]
    tcl::dict::set charinfo 9573 [list desc "Box Drawings Down Double and Horizontal Single"      short "boxd_ddhzs"]
    tcl::dict::set charinfo 9574 [list desc "Box Drawings Double Down and Horizontal"             short "boxd_ddhz"]
    tcl::dict::set charinfo 9575 [list desc "Box Drawings Up Single and Horizontal Double"        short "boxd_ushzd"]
    tcl::dict::set charinfo 9576 [list desc "Box Drawings Up Double and Horizontal Single"        short "boxd_udhzs"]
    tcl::dict::set charinfo 9577 [list desc "Box Drawings Double Up and Horizontal"               short "boxd_duhz"]
    tcl::dict::set charinfo 9578 [list desc "Box Drawings Vertical Single and Horizontal Double"  short "boxd_vshzd"]
    tcl::dict::set charinfo 9579 [list desc "Box Drawings Vertical Double and Horizontal Single"  short "boxd_vdhzs"]
    tcl::dict::set charinfo 9580 [list desc "Box Drawings Double Vertical and Horizontal"         short "boxd_dvhz"]
    tcl::dict::set charinfo 9581 [list desc "Box Drawings Light Arc Down and Right"               short "boxd_ladr"]
    tcl::dict::set charinfo 9582 [list desc "Box Drawings Light Arc Down and Left"                short "boxd_ladl"]
    tcl::dict::set charinfo 9583 [list desc "Box Drawings Light Arc Up and Left"                  short "boxd_laul"]
    tcl::dict::set charinfo 9584 [list desc "Box Drawings Light Arc Up and Right"                 short "boxd_laur"]
    tcl::dict::set charinfo 9585 [list desc "Box Drawings Light Diagonal Upper Right To Lower Left"   short "boxd_ldgurll"]
    tcl::dict::set charinfo 9586 [list desc "Box Drawings Light Diagonal Upper Left To Lower Right"   short "boxd_ldgullr"]
    tcl::dict::set charinfo 9587 [list desc "Box Drawings Light Diagonal Cross"                   short "boxd_ldc"]
    tcl::dict::set charinfo 9588 [list desc "Box Drawings Light Left"                             short "boxd_ll"]
    tcl::dict::set charinfo 9589 [list desc "Box Drawings Light Up"                               short "boxd_lu"]
    tcl::dict::set charinfo 9590 [list desc "Box Drawings Light Right"                            short "boxd_lr"]
    tcl::dict::set charinfo 9591 [list desc "Box Drawings Light Down"                             short "boxd_ld"]
    tcl::dict::set charinfo 9592 [list desc "Box Drawings Heavy Left"                             short "boxd_hl"]
    tcl::dict::set charinfo 9593 [list desc "Box Drawings Heavy Up"                               short "boxd_hu"]
    tcl::dict::set charinfo 9594 [list desc "Box Drawings Heavy Right"                            short "boxd_hr"]
    tcl::dict::set charinfo 9595 [list desc "Box Drawings Heavy Down"                             short "boxd_hd"]
    tcl::dict::set charinfo 9596 [list desc "Box Drawings Light Left and Heavy Right"             short "boxd_llhr"]
    tcl::dict::set charinfo 9597 [list desc "Box Drawings Light Up and Heavy Down"                short "boxd_luhd"]
    tcl::dict::set charinfo 9598 [list desc "Box Drawings Heavy Left and Light Right"             short "boxd_hllr"]
    tcl::dict::set charinfo 9599 [list desc "Box Drawings Heavy Up and Light Down"                short "boxd_huld"]


    tcl::dict::set charsets "Halfwidth and Fullwidth Forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants)" settype "tcl_supplemental"]
        tcl::dict::set charsets "ascii_fullwidth" [list ranges [list {start 65281 end 65374}] description "Ascii 21 to 7E fullwidth" parentblock "halfwidth_and_fullwidth_forms" settype "other"]

    tcl::dict::set charsets "Specials" [list ranges [list {start 65520 end 65535}] description "Specials" settype "tcl_supplemental"]

    tcl::dict::set charsets "noncharacters" [list ranges [list {*}{
        {start 64976 end 65007   note "BMP FDD0..FDEF"}
        {start 65534 end 65535   note "BMP FFFE,FFFF"}
        {start 131070 end 131071 note "plane1 1FFFE,1FFFF"}
        {start 196606 end 196607 note "plane2 2FFFE,2FFFF"}
        {start 262142 end 262143 note "plane3 3FFFE,3FFFF"}
        {start 327678 end 327679 note "plane4 4FFFE,4FFFF"}
        {start 393214 end 393215 note "plane5 5FFFE,5FFFF"}
        {start 458750 end 458751 note "plane6 6FFFE,6FFFF"}
        {start 524286 end 524287 note "plane7 7FFFE,7FFFF"}
        {start 589822 end 589823 note "plane8 8FFFE,8FFFF"}
        {start 655358 end 655359 note "plane9 9FFFE,9FFFF"}
        {start 720894 end 720895 note "plane10 AFFFE,AFFFF"}
        {start 786430 end 786431 note "plane11 BFFFE,BFFFF"}
        {start 851966 end 851967 note "plane12 CFFFE,CFFFF"}
        {start 917502 end 917503 note "plane13 DFFFE,DFFFF"}
        {start 983038 end 983039 note "plane14 EFFFE,EFFFF"}
        {start 1048574 end 1048575 note "plane15 FFFFE,FFFFF"}
        {start 1114110 end 1114111 note "plane16 10FFFE,10FFFF"}
        }] description "non-characters" settype "tcl_supplemental"
    ]

    #build dicts keyed on short
    variable charshort
    proc _build_charshort {} {
        variable charshort
        set charshort [tcl::dict::create]
        variable charinfo
        tcl::dict::for {k v} $charinfo {
            if {[tcl::dict::exists $v short]} {
                set sh [tcl::dict::get $v short]
                if {[tcl::dict::exists $charshort $sh]} {
                    puts stderr "_build_charshort WARNING character data load duplicate shortcode '$sh'"
                }
                tcl::dict::set charshort $sh  [format %c $k]
            }
        }
        return [tcl::dict::size $charshort]
    }
    _build_charshort

    variable charset_extents_startpoints ;#stores endpoints   associated with each startpoint - but named after key which is startpoint.
    variable charset_extents_endpoints   ;#stores startpoints assoicated with each endpoint   - but named after key which is endpoint.
    variable charset_extents_rangenames  ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set}
    #build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges)
    #Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets.
    #as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict
    #startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key
    #endpoints   - key is endpoint of a range,   value is list of startpoints one for each range ending at this endpoint-key
    proc _build_charset_extents {} {
        variable charsets
        variable charset_extents_startpoints
        variable charset_extents_endpoints
        variable charset_extents_rangenames
        set charset_extents_startpoints     [tcl::dict::create]
        set charset_extents_endpoints       [tcl::dict::create]
        set charset_extents_rangenames      [tcl::dict::create]
        tcl::dict::for {setname setinfo} $charsets {
            set ranges [tcl::dict::get $setinfo ranges]
            if {[tcl::dict::get $setinfo settype] eq "block"} {
                #unicode block must have a single range
                #we consider a char a member of the block even if unassigned/reserved (as per unicode documentation)
                set start [tcl::dict::get [lindex $ranges 0] start]
                set end [tcl::dict::get [lindex $ranges 0] end]
                if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
                    #assertion if end wasn't in startpoits list - then start won't be in endpoints list
                    tcl::dict::lappend charset_extents_startpoints  $start $end
                    tcl::dict::lappend charset_extents_endpoints $end $start
                }
                tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname 1]
            } else {
                #multirange sets/scripts. have holes. Char not a member if it's not explicitly in a defined range.
                #They should be in order within a set - but we don't assume so
                set r 1
                foreach range $ranges {
                    set start [tcl::dict::get $range start]
                    set end [tcl::dict::get $range end]
                    if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
                        #assertion if end wasn't in startpoits list - then start won't be in endpoints list
                        tcl::dict::lappend charset_extents_startpoints  $start $end
                        tcl::dict::lappend charset_extents_endpoints    $end   $start
                    }
                    tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname $r]
                    incr r
                }
            }
        }
        #maintain in sorted order
        #-stride is available in lsort even at tcl8.6 - but not in lsearch
        set charset_extents_startpoints [lsort -stride 2 -integer $charset_extents_startpoints]
        set charset_extents_endpoints   [lsort -stride 2 -integer $charset_extents_endpoints]
        #no need to sort charset_extents_rangenames - lookup only done using dict methods
        return [tcl::dict::size $charset_extents_startpoints]
    }
    _build_charset_extents ;#rebuilds for all charsets

    #nerdfonts are within the Private use E000 - F8FF range
    proc load_nerdfonts {} {
        variable charsets
        variable charinfo
        package require fileutil
        set ver [package provide punk::char]
        if {$ver ne ""} {
            set ifneeded [package ifneeded punk::char [package provide punk::char]]
            #puts stderr "punk::char ifneeded script: $ifneeded"
            lassign [split $ifneeded ";"] _  sourceinfo
            set basedir  [file dirname [lindex $sourceinfo end]]
        } else {
            #review - will only work at package load time
            set scr [info script]
            if {$scr eq ""} {
                error "load_nerdfonts unable to determine package folder"
            }
            set basedir [file dirname [info script]]
        }
        set pkg_data_dir [file join $basedir char]
        set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt]
        if {[file exists $fname]} {
            #puts stderr "load_nerdfonts loading $fname"
            set data [fileutil::cat -translation binary $fname]
            set short_seen [tcl::dict::create]
            set current_set_range [tcl::dict::create]
            set filesets_loading [list]
            foreach ln [split $data \n] {
                    set ln [tcl::string::trim $ln]
                    if {$ln eq ""} {continue}
                    set desc [lassign $ln hex rawsetname]
                    set hexnum 0x$hex
                    set dec [expr $hexnum]
                    set setname "nf_$rawsetname"  ;#Ensure nerdfont set names are prefixed.

                    if {$setname ni $filesets_loading} {
                        if {![tcl::dict::exists $charsets $setname]} {
                            #set exists - but not in our filesets_loading list - therefore this set has been previously loaded, so clear old data first
                            dict unset charset $setname
                        }
                        set newrange [list start $dec end $dec]
                        tcl::dict::set current_set_range $setname $newrange
                        tcl::dict::set charsets $setname [list ranges [list $newrange]  description "nerd fonts $rawsetname" settype "nerdfonts privateuse"]

                        lappend filesets_loading $setname
                    }
                    #expects ordered glyph list
                    set existing_range [tcl::dict::get $current_set_range $setname]
                    set existing_end [tcl::dict::get $existing_range end]
                    if {$dec - $existing_end == 1} {
                        #part of current range
                        tcl::dict::set current_set_range $setname end $dec
                        #overwrite last ranges element
                        set rangelist [lrange [tcl::dict::get $charsets $setname ranges] 0 end-1]
                        lappend rangelist [tcl::dict::get $current_set_range $setname]
                        tcl::dict::set charsets $setname ranges $rangelist
                    } else {
                        #new range for set
                        tcl::dict::set current_set_range $setname start $dec
                        tcl::dict::set current_set_range $setname end $dec
                        set rangelist [tcl::dict::get $charsets $setname ranges]
                        lappend rangelist [tcl::dict::get $current_set_range $setname]
                        tcl::dict::set charsets $setname ranges $rangelist
                    }

                    if {![tcl::dict::exists $charinfo $dec]} {
                        # -- ---
                        #review
                        set map [list beaufort bf gibbous gb crescent cr thunderstorm tstorm thermometer thermom]
                        lappend map {*}[list directory dir creativecommons ccom creative_commons ccom forwardslash fs]
                        lappend map {*}[list multimedia mm multiple multi outline outl language lang]
                        lappend map {*}[list odnoklassniki okru]
                        # -- ---
                        #consider other ways to unambiguously shorten names?
                        #normalize nf_fa & nf_fa 'o' element to 'outl' so outlines can be searched across sets more easily (o not necessarily at last position)
                        set normdesc [list]
                        foreach el $desc {
                            if {$el eq "o"} {
                                set el "outl"
                            }
                            lappend normdesc $el
                        }
                        set joined_desc [join $normdesc _]
                        #map after join so we can normalize some underscored elements e.g creativecommons & creative_commons
                        set mapped_desc [tcl::string::map $map $joined_desc]
                        set s nf_${rawsetname}_$mapped_desc

                        if {![tcl::dict::exists $short_seen $s]} {
                            tcl::dict::set short_seen $s {}
                        } else {
                            #duplicate in the data file (e.g 2023 weather night alt rain mix)
                            set s ${s}_$hex
                        }
                        tcl::dict::set charinfo $dec [list desc "$desc" short $s]
                    }
            }
           _build_charshort
           _build_charset_extents
        } else {
            puts stderr "unable to find glyph file. Tried $fname"
        }
    }

    proc package_base {} {
        #assume punk::char is in .tm form and we can use the package provide statement to determine base location
        #review
        set pkgver [package present punk::char]
        set pkginfo [package ifneeded punk::char $pkgver]
        set tmfile [lindex $pkginfo end]
        set pkg_base  [file dirname $tmfile]
        return $pkg_base
    }
    tcl::namespace::eval internal {
        proc unicode_folder {} {
            set parent [file join [punk::char::package_base] char]
            set candidates [glob -nocomplain -type d -dir $parent -tail unicode*]
            set candidates [lsort -increasing $candidates] ;#review - dictionary sort - how are unicode versions ranked/compared??
            if {![llength $candidates]} {
                error "Failed to find unicode data folder in folder '$parent'"
            }
            set folder [file join $parent [lindex $candidates end]]
            return $folder
        }
        proc dict_getdef {dictValue args} {
            if {[llength $args] < 2} {
                error {wrong # args: should be "dict_getdef dictValue ?key ...? key default"}
            }
            set keys [lrange $args 0 end-1]
            if {[tcl::dict::exists $dictValue {*}$keys]} {
                return [tcl::dict::get $dictValue {*}$keys]
            } else {
                return [lindex $args end]
            }
        }
    }


    #charsets structure
    #tcl::dict::set charsets "halfwidth_and_fullwidth_forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants) settype block"]

    #unicode Blocks.txt
    #load the defined blocks into 'charsets' and mark as type 'block'. Unicode blocks have only one range - and don't overlap.
    #We don't treat unassigned/reserved codes within a block specially at this stage - ie we will not chop a block into subranges on that basis.
    #unassigned code points should get certain default properties (e.g bidirectionality ) according to their block - so it makes sense to treat them as belonging to the block.
    #They also get the general property of Cn (Other,not assigned or Other,reserved) and a "Basic Type" of Noncharacter or Reserved
    proc load_unicode_blocks {} {
        #sample data line
        #0000..007F; Basic Latin
        variable charsets
        set file [file join [internal::unicode_folder] Blocks.txt]
        if {![file exists $file]} {
            error "Unicode Blocks.txt file not found at path '$file'"
        }
        puts "ok.. loading"
        set fd [open $file r]
        chan configure $fd -translation binary
        set data [read $fd]
        close $fd
        set block_count 0
        foreach ln [split $data \n] {
            set ln [tcl::string::trim $ln]
            if {[tcl::string::match #* $ln]} {
                continue
            }
            if {[set pcolon [tcl::string::first ";" $ln]] > 0} {
                set lhs [tcl::string::trim [tcl::string::range $ln 0 $pcolon-1]]
                set name [tcl::string::trim [tcl::string::range $ln $pcolon+1 end]]
                set lhsparts [split $lhs .]
                set start [lindex $lhsparts 0]
                set end   [lindex $lhsparts end]
                #puts "$start -> $end  '$name'"
                set  decimal_start [expr {"0x$start"}]
                set  decimal_end   [expr {"0x$end"}]
                tcl::dict::set charsets $name [list ranges [list [list start $decimal_start end $decimal_end note "unicode block $lhs"]] description "" settype block]
                incr block_count
            }
        }
        _build_charset_extents
        return $block_count
    }

    #unicode scripts

    #unicode UnicodeData.txt


    #https://www.unicode.org/reports/tr44/#Property_Values


    #unicode EastAsianWidth.txt
    #classify width of character - which is contextual in some cases
    #####
    #Review - this is initial naive assumption that should get us mostly what we want for layout purposes in a utf-8-centric world.
    #We will just load the values and treat H,N,Na as 1-wide and A,F,W as 2-wide for functions such as char::string_width on the basis that those using legacy sets can query the property and make their own determinations in those contexts.
    ####
    # -- ---
    #A = Ambiguous - All characters that can be sometimes wide and sometimes narrow.  (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous)
    #F = East Asian Full-width
    #H = East Asian Half-width
    #N = Not east Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na
    #Na = East Asian Narrow  - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII)
    #W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography)
    # -- ---


    proc charshort {shortname} {
        variable charshort
        return [tcl::dict::get $charshort $shortname]
    }

    proc box_drawing {args} {
        return [charset "Box Drawing" {*}$args]
    }
    proc box_drawing_dict {} {
        return [charset_dict "Box Drawing"]
    }

    proc char_hex {char} {
        return  [format %08x [scan $char %c]]
    }

    proc char_info_hex {hex args} {
        set hex [tcl::string::map [list _ ""] $hex]
        if {[tcl::string::is xdigit -strict $hex]} {
            #has no leading 0x
            set dec [expr {"0x$hex"}]
        } else {
            set dec [expr {$hex}]
        }
        return [char_info_dec $dec {*}$args]
    }
    proc char_info {char args} {
        #Note - on some versions of Tcl -e.g 8.6 use could supply something like \U1f600 (smiley icon) but we receive fffd (replacement special)
        #there is no way to detect what the user intended ie we can't distinguish if they actually typed \UFFFD
        #we can test if such mapping happens in general - and warn if codepoint is FFFD in the result dict
        set returninfo [tcl::dict::create]
        if {[tcl::string::equal \UFFFD $char] && [tcl::string::equal \U1F600 \UFFFD]} {
            tcl::dict::set returninfo WARNING "this tcl maps multiple to FFFD"
        }
        lassign [scan $char %c%s] dec_char remainder
        if {[tcl::string::length $remainder]} {
            error "char_info requires a single character"
        }
        set result [tcl::dict::merge $returninfo [char_info_dec $dec_char {*}$args]]
    }
    proc char_info_dec {dec args} {
        set dec_char [expr {$dec}]
        set opts [tcl::dict::create {*}{
            -fields {default}
            -except {}
        }]
        #testwidth is so named because it peforms an actual test on the console using ansi escapes - and the name gives a hint that it is a little slow
        set known_fields [list all default dec hex desc short testwidth char memberof] ;#maint fields from charinfo 'desc' 'short'
        #todo - unicode properties
        # tclwhitespace  (different to unicode concept of whitespace. review )

        foreach {k v} $args {
            switch -- $k {
                -fields - -except {
                    tcl::dict::set opts $k $v
                }
                default {
                    error "char_info unrecognised option '$k'. Known options:'[tcl::dict::keys $opts]' known_fields: $known_fields usage: char_info <char> ?-fields {<fieldnames>}? ?-except {<fieldnames>}?"
                }
            }
        }
        # -- --- --- --- --- --- --- --- --- --- --- ---
        set opt_fields      [tcl::dict::get $opts -fields]
        set opt_except      [tcl::dict::get $opts -except]
        # -- --- --- --- --- --- --- --- --- --- --- ---
        set initial_fields [list]
        if {"default" in $opt_fields} {
            set initial_fields $known_fields
            if {"testwidth" ni $opt_fields} {
                if {"testwidth" ni $opt_except} {
                    lappend opt_except testwidth
                }
            }
            if {"char" ni $opt_fields} {
                if {"char" ni $opt_except} {
                    lappend opt_except char
                }
            }
        } elseif {"all" in $opt_fields} {
            set initial_fields $known_fields
        } else {
            foreach f $opt_fields {
                if {$f in $known_fields} {
                    lappend initial_fields $f
                } else {
                    error "char_info unknown field name: '$f' known fields: '$known_fields'"
                }
            }
        }
        foreach e $opt_except {
            if {$e ni $known_fields} {
                error "char_info unknown field name $e in -except. known fields: '$known_fields'"
            }
        }
        set fields [list]
        foreach f $initial_fields {
            if {$f ne "all" && $f ni $opt_except} {
                lappend fields $f
            }
        }
        if {![llength $fields]} {
            return
        }

        variable charinfo
        variable charsets
        set hex_char [format %04x $dec_char]
        set returninfo [tcl::dict::create]
        foreach f $fields {
            switch -- $f {
                dec {
                    tcl::dict::set returninfo dec $dec_char
                }
                hex {
                    tcl::dict::set returninfo hex $hex_char
                }
                desc {
                    if {[tcl::dict::exists $charinfo $dec_char desc]} {
                        tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char desc]
                    } else {
                        tcl::dict::set returninfo desc ""
                    }
                }
                short {
                    if {[tcl::dict::exists $charinfo $dec_char short]} {
                        tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char short]
                    } else {
                        tcl::dict::set returninfo short ""
                    }
                }
                testwidth {
                    #todo - expectedwidth - lookup the printing width it is *supposed* to have from unicode tables
                    #testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time
                    set existing_testwidth ""
                    if {[tcl::dict::exists $charinfo $dec_char testwidth]} {
                    set existing_testwidth [tcl::dict::get $charinfo $dec_char testwidth]
                    }
                    if {$existing_testwidth eq ""} {
                        #no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.)
                        set char [format %c $dec_char]
                        set chwidth [char_info_testwidth $char]

                        tcl::dict::set returninfo testwidth $chwidth
                        #cache it. todo - -verify flag to force recalc in case font/terminal changed in some way?
                        tcl::dict::set charinfo $dec_char testwidth $chwidth
                    } else {
                        tcl::dict::set returninfo testwidth $existing_testwidth
                    }
                }
                char {
                    set char [format %c $dec_char]
                    tcl::dict::set returninfo char $char
                }
                memberof {
                    #memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising
                    #note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4)
                    #This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges.
                    #We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here.
                    #some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char)
                    set memberof [list]
                    tcl::dict::for {setname setinfo} $charsets {
                        foreach r [tcl::dict::get $setinfo ranges] {
                            set s [tcl::dict::get $r start]
                            set e [tcl::dict::get $r end]
                            if {$dec_char >= $s && $dec_char <= $e} {
                                lappend memberof $setname
                                break
                            }
                        }
                    }
                    tcl::dict::set returninfo memberof $memberof
                }
            }
        }

        return $returninfo
    }

    proc _char_info_dec_memberof_scan {dec} {
        variable charsets
        set memberof [list]
        tcl::dict::for {setname setinfo} $charsets {
            foreach r [tcl::dict::get $setinfo ranges] {
                set s [tcl::dict::get $r start]
                set e [tcl::dict::get $r end]
                if {$dec >= $s && $dec <= $e} {
                    lappend memberof $setname
                    break
                }
            }
        }
        return $memberof
    }
    proc range_split_info {dec} {
        variable charset_extents_startpoints
        variable charset_extents_endpoints
        set skeys [tcl::dict::keys $charset_extents_startpoints]
        set ekeys [tcl::dict::keys $charset_extents_endpoints]
        set splen [tcl::dict::size $charset_extents_startpoints]
        set eplen [tcl::dict::size $charset_extents_endpoints]
        set s [lsearch -bisect -integer $skeys $dec]
        set s_at_or_below [lrange $skeys 0 $s]
        set e_of_s [list]
        foreach sk $s_at_or_below {
            lappend e_of_s {*}[tcl::dict::get $charset_extents_startpoints $sk]
        }
        set e_of_s [lsort -integer $e_of_s]
        set splitposn [lsearch -bisect -integer $e_of_s $dec]
        if {[lindex $e_of_s $splitposn] < $dec} {incr splitposn}
        #set lhs_endpoints_to_check [expr {[llength $e_of_s] - $splitposn}]
        set reduced_endpoints [lrange $e_of_s $splitposn end]
        set sps [list]
        foreach ep $reduced_endpoints {
            lappend sps {*}[tcl::dict::get $charset_extents_endpoints $ep]
        }


        set e [lsearch -bisect -integer $ekeys $dec]
        if {$e >= 0} {
            set e_at_or_above [lrange $ekeys $e end]
            set s_of_e [list]
            foreach  ek $e_at_or_above {
                lappend s_of_e {*}[tcl::dict::get $charset_extents_endpoints $ek]
            }
            set startpoints_of_above [llength $s_of_e]
            set splitposn [lsearch -bisect -integer $s_of_e $dec]
            set reduced_startpoints [lrange $s_of_e 0 $splitposn]
            set eps [list]
            foreach sp $reduced_startpoints {
                lappend eps {*}[tcl::dict::get $charset_extents_startpoints $sp]
            }
        } else {
            set s_of_e [list]
            set reduced_startpoints [list]
            set eps [list]

        }

        return [tcl::dict::create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"]
    }
    #for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6
    #performance biased towards lower numbered characters (which is not too bad in the context of unicode)
    #todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle
    #review with scripts loaded and more defined ranges..
    #This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints
    #Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ?
    #review - compare with 'interval tree' algorithms.
    proc char_info_dec_memberof {dec} {
        variable charset_extents_startpoints
        variable charset_extents_endpoints
        variable charset_extents_rangenames
        if {[package vcompare [info tclversion] 8.7a5] >= 0} {
            #algorithm should theoretically be a little better with -stride
            set last_smaller_or_equal_startposn [lsearch -stride 2 -bisect -integer $charset_extents_startpoints $dec]
            set sets_starting_below [lrange $charset_extents_startpoints 0 $last_smaller_or_equal_startposn+1] ;#+1 to include 2nd element of stridden pair
            set endpoints_of_starting_below [lsort -integer [concat {*}[tcl::dict::values $sets_starting_below]]]
        } else {
            #no -stride available
            set startkeys [tcl::dict::keys $charset_extents_startpoints]
            set last_smaller_or_equal_startkeyposn [lsearch -bisect -integer $startkeys $dec] ;#assert will always return one of the keys if number >=0 supplied (last key if > all)
            #set startkey_found [lindex $startkeys $last_smaller_or_equal_startkeyposn]
            set start_below_keys [lrange $startkeys 0 $last_smaller_or_equal_startkeyposn] ;#These are the keys of sets which start at or below dec
            #puts "start_below_keys: '$start_below_keys'"
            set endpoints_of_starting_below [list]
            foreach belowkey $start_below_keys {
                lappend endpoints_of_starting_below {*}[tcl::dict::get $charset_extents_startpoints $belowkey]
            }
            #set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below[unset endpoints_of_starting_below]]
            set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below]
        }

        set splitposn [lsearch -bisect -integer $endpoints_of_starting_below $dec] ;#splitposn = last smaller or equal endposn
        if {[lindex $endpoints_of_starting_below $splitposn] < $dec} { incr splitposn}
        set reduced_opposite_limit [lrange $endpoints_of_starting_below $splitposn end]
        ################
        #note each endpoint points to multiple startpoints which may still include some that are not in range. (e.g range y can share endpoint with x that starts in-range - but y starts above character )
        #  x1         x2
        #        y1   y2
        #     c
        ################
        #we have reduced our set of endpoints sufficiently (to those at or above dec) to run through and test each startpoint
        set ranges [list]
        foreach ep $reduced_opposite_limit {
            foreach s [tcl::dict::get $charset_extents_endpoints $ep] {
                if {$s <= $dec} {
                    lappend ranges [tcl::dict::get $charset_extents_rangenames $s,$ep]
                }
            }
        }
        return $ranges

    }


    #with glob searching of description and short
    proc char_range_dict {start end args} {
        if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} {
            error "char_range_dict error start and end must be integers"
        }
        set and_globs [list]
        if {![llength $args]} {
            set args [list *]
        }
        foreach glob $args {
            if {![regexp {[*?]} $glob]} {
                lappend and_globs "*$glob*"
            } else {
                lappend and_globs $glob
            }
        }
        variable charinfo
        set cdict [tcl::dict::create]
        set start [expr {$start}] ;#force string rep to decimal - otherwise first use of i as string could be hex or other rep whilst other i values will be decimal string rep due to incr
        for {set i $start} {$i <= $end} {incr i} {
            set hx [format %04x $i]
            set ch [format %c $i]
            if {[tcl::dict::exists $charinfo $i desc]} {
                set d [tcl::dict::get $charinfo $i desc]
            } else {
                set d ""
            }
            if {[tcl::dict::exists $charinfo $i short]} {
                set s [tcl::dict::get $charinfo $i short]
            } else {
                set s ""
            }
            set matchcount 0
            foreach glob $and_globs {
                if {[tcl::string::match -nocase $glob $s] || [tcl::string::match -nocase $glob $d]} {
                   incr matchcount
                }
            }
            if {$matchcount == [llength $and_globs]} {
                if {[tcl::dict::exists $charinfo $i]} {
                    tcl::dict::set cdict $hx [tcl::dict::merge [tcl::dict::create dec $i hex $hx char $ch] [tcl::dict::get $charinfo $i]]
                } else {
                    tcl::dict::set cdict $hx [list dec $i hex $hx char $ch desc $d short $s]
                }
            }
        }
        return $cdict
    }
    #with glob searches of desc and short
    proc char_range {start end args} {
        package require overtype
        if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} {
            error "char_range error start and end must be integers"
        }
        set charset_dict [char_range_dict $start $end {*}$args]
        set out ""
        set col3 [tcl::string::repeat " " 12]
        tcl::dict::for {k inf} $charset_dict {
            set s [internal::dict_getdef $inf short ""]
            set d [internal::dict_getdef $inf desc ""]
            set s_col [overtype::left $col3 $s]
            append out "$k [tcl::dict::get $inf dec] [tcl::dict::get $inf char] $s_col $d" \n
        }
        return $out
    }


    #non-overlapping unicode blocks
    proc char_blocks {{name_or_glob *}} {
        variable charsets
        #todo - more efficient datastructures?
        if {![regexp {[?*]} $name_or_glob]} {
            #no glob - just retrieve it
            if {[tcl::dict::exists $charsets $name_or_glob]} {
                if {[tcl::dict::get $charsets $name_or_glob settype] eq "block"} {
                    return [tcl::dict::create $name_or_glob [tcl::dict::get $charsets $name_or_glob]]
                }
            }
            #no exact match - try case insensitive..
            set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]
            if {$name ne ""} {
                if {[tcl::dict::get $charsets $name settype] eq "block"} {
                    return [tcl::dict::create $name [tcl::dict::get $charsets $name]]
                }
            }

        } else {
            #build a subset
            set charsets_block [tcl::dict::create]
            tcl::dict::for {k v} $charsets {
                if {[tcl::string::match -nocase $name_or_glob $k]} {
                    if {[tcl::dict::get $v settype] eq "block"} {
                        tcl::dict::set charsets_block $k $v
                    }
                }
            }
            return $charsets_block
        }
    }
    proc charset_names {{name_or_glob *}} {
        variable charsets
        if {![regexp {[?*]} $name_or_glob]} {
            #no glob - just retrieve it
            if {[tcl::dict::exists $charsets $name_or_glob]} {
                return [list $name_or_glob]
            }
            #no exact match - try case insensitive..
            set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]
            if {$name ne ""} {
                return [list $name]
            }
        } else {
            if {$name_or_glob eq "*"} {
                return [lsort [tcl::dict::keys $charsets]]
            }
            #tcl::dict::keys $dict <pattern> doesn't have option for case insensitive searches
            return [lsort [lsearch -all -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]]
        }
    }

    #deprecated
    #major named sets such as unicode blocks, scripts, and other sets such as microsoft WGL4
    #case insensitive search - possibly with *basic* globs (? and * only - not lb rb)
    proc charset_names2 {{namesearch *}} {
        variable charsets
        #dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results
        #set sortedkeys [lsort -increasing -dictionary [tcl::dict::keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below
        set sortedkeys [lsort -increasing [tcl::dict::keys $charsets]]
        if {$namesearch eq "*"} {
            return $sortedkeys
        }
        if {[regexp {[?*]} $namesearch]} {
            #name glob search
            return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used
        } else {
            #return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs  - bails out earlier if -sorted?
            return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs
        }
    }
    proc charsets {{namesearch *}} {
        package require textblock
        variable charsets
        set charset_names [charset_names $namesearch]
        set settype_list [list]
        foreach setname $charset_names {
            lappend settype_list [tcl::dict::get $charsets $setname settype]
        }

        #set charset_names [linsert $charset_names 0 "Set Name"]
        ledit charset_names 0 -1 "Set Name"
        #set settype_list  [linsert $settype_list 0 "Set Type"]
        ledit settype_list 0 -1 "Set Type"

        return [textblock::join -- [list_as_lines -- $charset_names] " " [list_as_lines $settype_list]]
    }
    proc charset_defget {exactname} {
        variable charsets
        return [tcl::dict::get $charsets $exactname]
    }
    proc charset_defs {charsetname} {
        variable charsets
        set matches [charset_names $charsetname]
        set def_list [list]
        foreach setname $matches {
            lappend def_list [tcl::dict::create $setname [tcl::dict::get $charsets $setname]]
        }
        return [join $def_list \n]
    }
    proc charset_dictget {exactname} {
        variable charsets
        set setinfo [tcl::dict::get $charsets $exactname]
        set ranges [tcl::dict::get $setinfo ranges]
        set charset_dict [tcl::dict::create]
        foreach r $ranges {
            set start [tcl::dict::get $r start]
            set end   [tcl::dict::get $r end]
            set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]]
        }
        return $charset_dict
    }
    proc charset_dicts {searchname} {
        variable  charsets
        set matches [charset_names $searchname]
        if {![llength $matches]} {
            error "No charset found matching name '$searchname' - use 'charset_names' to get list"
        }
        set dict_list [list]
        foreach m $matches {
            lappend dict_list [tcl::dict::create $m [charset_dictget $m]]
        }
        #return $dict_list
        return [join $dict_list \n]
    }
    proc charset_page {namesearch args} {
        _charset_page_search $namesearch $args ;#pass args to descsearch argument
    }
    proc _charset_page_search {namesearch search_this_and_that args} {
        variable  charsets
        variable charinfo
        set matched_names [charset_names $namesearch]
        if {![llength $matched_names]} {
            error "charset_page no charset matched pattern '$namesearch' - use 'charset_names' to get list"
        }
        set defaults [tcl::dict::create {*}{
            -ansi   0
            -lined  1
        }]
        set opts [tcl::dict::merge $defaults $args]
        # -- --- --- ---
        set opt_ansi        [tcl::dict::get $opts -ansi]
        set opt_lined       [tcl::dict::get $opts -lined]
        # -- --- --- ---
        set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}

        if {$opt_ansi} {
            set a1 [a BLACK white bold]
            set a2 [a]
        } else {
            set a1 ""
            set a2 ""
        }
        set cols 16
        set prefix "    "
        append out $prefix
        foreach charsetname $matched_names {
            if {[llength $search_this_and_that]} {
                set setinfo [tcl::dict::get $charsets $charsetname]
                set ranges [tcl::dict::get $setinfo ranges]
                set charset_dict [tcl::dict::create]
                foreach r $ranges {
                    set start [tcl::dict::get $r start]
                    set end   [tcl::dict::get $r end]
                    set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
                }
            } else {
                set charset_dict [charset_dictget $charsetname]
            }
            if {![tcl::dict::size $charset_dict]} {
                continue
            }
            set i 1
            append out \n $prefix $charsetname
            append out \n

            set marker_line $prefix
            set line     $prefix
            tcl::dict::for {hex inf} $charset_dict {
                set ch [tcl::dict::get $inf char]
                set twidth ""
                set dec [expr {"0x$hex"}]
                if {[tcl::dict::exists $charinfo $dec testwidth]} {
                    set twidth [tcl::dict::get $charinfo $dec testwidth]
                }
                if {$twidth eq ""} {
                    #set width [ansifreestring_width $ch] ;#based on unicode props
                    set width [grapheme_width_cached $ch]
                } else {
                    set width $twidth
                }
                if {$width == 0} {
                    set marker "   "
                    if {[regexp $re_diacritics $ch]} {
                        #attempt to combine with space to get 3-wide displayv with diacritic showing at left space
                        #todo - dualchar diacritics?
                        set displayv " $ch  "
                    } else {
                        set displayv "   "
                    }
                } elseif {$width == 1} {
                    set marker "_  "
                    set displayv "${a1}$ch${a2}  "
                } else {
                    #presumed 2
                    set marker "__ "
                    set displayv "${a1}$ch${a2} "
                }
                set hexlen [tcl::string::length $hex]
                append marker_line "[tcl::string::repeat " " $hexlen] $marker"
                append line "$hex $displayv"
                if {$i == [tcl::dict::size $charset_dict] || $i % $cols == 0} {
                    if {$opt_lined} {
                        append out $marker_line \n
                    }
                    append out $line \n
                    set marker_line $prefix
                    set line $prefix
                    #set out [tcl::string::range $out 0 end-2]
                    #append out \n "        "
                }
                incr i
            }
        }
        set out [tcl::string::trimright $out " "]
        return $out
    }

    #allows search on both name and an anded list of globs to be applied to description & short
    proc charset {namesearch args} {
        package require overtype
        variable  charsets
        set matched_names [charset_names $namesearch]
        if {![llength $matched_names]} {
            error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
        }
        set search_this_and_that $args

        set out ""

        foreach charsetname $matched_names {
            if {[llength $search_this_and_that]} {
                set setinfo [tcl::dict::get $charsets $charsetname]
                set ranges [tcl::dict::get $setinfo ranges]
                set charset_dict [tcl::dict::create]
                foreach r $ranges {
                    set start [tcl::dict::get $r start]
                    set end   [tcl::dict::get $r end]
                    set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
                }
            } else {
                set charset_dict [charset_dictget $charsetname]
            }

            set col_items_short [list]
            set col_items_desc  [list]
            tcl::dict::for {k inf} $charset_dict {
                lappend col_items_desc  [internal::dict_getdef $inf desc ""]
                lappend col_items_short [internal::dict_getdef $inf short ""]
            }
            if {[llength $col_items_desc]} {
                set widest3 [tcl::mathfunc::max {*}[lmap v $col_items_short {tcl::string::length $v}]]
                if {$widest3 == 0} {
                    set col3 " "
                } else {
                    set col3 [tcl::string::repeat " " $widest3]
                }
                tcl::dict::for {k inf} $charset_dict {
                    set s [internal::dict_getdef $inf short ""]
                    set d [internal::dict_getdef $inf desc ""]
                    set s_col [overtype::left $col3 $s]
                    append out "$k [tcl::dict::get $inf char] $s_col $d" \n
                }
            }
        }

        return $out
    }

    #use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria
    proc charset_calibrate {namesearch args} {
        variable  charsets
        variable charinfo
        set matched_names [charset_names $namesearch]
        if {![llength $matched_names]} {
            error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
        }
        set search_this_and_that $args
        set charcount 0
        set width_results [tcl::dict::create]
        puts stdout "calibrating using terminal cursor movements.."
        foreach charsetname $matched_names {
            if {[llength $search_this_and_that]} {
                set setinfo [tcl::dict::get $charsets $charsetname]
                set ranges [tcl::dict::get $setinfo ranges]
                set charset_dict [tcl::dict::create]
                foreach r $ranges {
                    set start [tcl::dict::get $r start]
                    set end   [tcl::dict::get $r end]
                    set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
                }
            } else {
                set charset_dict [charset_dictget $charsetname]
            }
            if {![tcl::dict::size $charset_dict]} {
                continue
            }
            tcl::dict::for {hex inf} $charset_dict {
                set ch [format %c 0x$hex]
                set twidth ""
                set dec [expr {"0x$hex"}]
                if {[tcl::dict::exists $charinfo $dec testwidth]} {
                    set twidth [tcl::dict::get $charinfo $dec testwidth]
                }
                if {$twidth eq ""} {
                    #puts -nonewline stdout "." ;#this
                    set width [char_info_testwidth $ch] ;#based on console test rather than unicode props
                    tcl::dict::set charinfo $dec testwidth $width
                } else {
                    set width $twidth
                }
                tcl::dict::incr width_results $width
                incr charcount
            }
        }
        puts stdout "\ncalibration done - results cached in charinfo dictionary"
        return [tcl::dict::create charcount $charcount widths $width_results]
    }

    #maint warning - also in overtype!
    #intended for single grapheme - but will work for multiple
    #cannot contain ansi or newlines
    #(a cache of ansifreestring_width calls - as these are quite regex heavy)
    #review - effective memory leak on longrunning programs if never cleared
    #tradeoff in fragmenting cache and reducing efficiency vs ability to clear in a scoped manner
    proc grapheme_width_cached {ch {key ""}} {
        variable grapheme_widths
        #if key eq "*" - we won't be able to clear that cache individually. Perhaps that's ok
        if {[tcl::dict::exists $grapheme_widths $key $ch]} {
            return [tcl::dict::get $grapheme_widths $key $ch]
        }
        set width [punk::char::ansifreestring_width $ch] ;#review - can we provide faster version if we know it's a single grapheme rather than a string? (grapheme is still a string as it may have combiners/diacritics)
        tcl::dict::set grapheme_widths $key $ch $width
        return $width
    }
    proc grapheme_width_cache_clear {key} {
        variable grapheme_widths
        if {$key eq "*} {
            set grapheme_widths [tcl::dict::create]
        } else {
            tcl::dict::unset grapheme_widths $key
        }
        return
    }
    #no char_width - use grapheme_width terminology to be clearer
    proc grapheme_width {char} {
        error "grapheme_width unimplemented - use ansifreestring_width"
    }

    #return N Na W etc from unicode data
    #review
    proc char_uc_width_prop {char} {
        error "char_uc_width unimplemented try textutil::wcswidth_type"
    }
    #todo - provide a grapheme_width function  that is optimised for speed
    proc string_width {text} {
        #burn approx 2uS (2024) checking for ansi codes - not just SGR
        if {[punk::ansi::ta::detect $text]} {
            puts stderr "string_width detected ANSI!"
        }
        if {[tcl::string::last \n $text] >= 0} {
            error "string_width accepts only a single line"
        }
        #tailcall ansifreestring_width $text
        ansifreestring_width $text
    }

    #todo - consider disallowing/erroring out when \r \n in string?
    # - tab/vtab?
    # - compare with wcswidth returning -1 for entire string containing such in python,perl
    proc wcswidth {string} {
        #faster than textutil::wcswidth (at least for string up to a few K in length)
        #..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
        #Tcl initial evaluation stack size is 2000 (? review)
        #REVIEW - when we cater for grapheme clusters - we can't just split the string at arbitrary points like this!!
        set chunksize 2000
        set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}]
        set width 0
        set startidx 0
        set endidx [expr {$startidx + $chunksize -1}]
        for {set i 0} {$i < $chunks_required} {incr i} {
            set chunk [tcl::string::range $string $startidx $endidx]

            set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
            foreach c $codes {
                if {$c <= 255} {
                    if {$c == 9 || ($c >= 31 && $c != 127)} {
                        #review - non-printing ascii? why does textutil::wcswidth report 1 ??
                        #todo - compare with python or other lang wcwidth
                        incr width
                    }
                } elseif {$c < 917504 || $c > 917631} {
                    #TODO - various other joiners and non-printing chars
                    set w [textutil::wcswidth_char $c]
                    if {$w < 0} {
                        return -1
                    } else {
                        incr width $w
                    }
                }
            }
            incr startidx   $chunksize
            incr endidx     $chunksize
        }
        return $width
    }


    proc wcswidth_clustered {string} {
        package require tk
        set width 0
        set i 0
        if {![regexp "\[\uFF-\U10FFFF\]" $string]} {
            return [punk::char::wcswidth_unclustered $string] ;#still use our wcswidth to account for non-printable ascii
        }
        while {$i < [tcl::string::length $string]} {
            set aftercluster [tk::endOfCluster $string $i]
            set g [string range $string $i $aftercluster-1]
            if {$aftercluster > ($i + 1)} {
                #review - proper way to determine screen width (columns occupied) of a cluster??
                #according to this:
                #https://lib.rs/crates/unicode-display-width
                #for each grapheme - if any of the code points in the cluster have an east asian width of 2,
                #The entire grapheme width is 2 regardless of how many code points constitute the grapheme
                set gw 1
                foreach ch [split $g ""] {
                    if {[punk::char::wcswidth_single $ch] == 2} {
                        set gw 2
                        break
                    }
                }
                incr width $gw

                #if {[string first \u200d $g] >=0} {
                #    incr width 2
                #} else {
                #    #other joiners???
                #    incr width [wcswidth_unclustered $g]
                #}
            } else {
                incr width [wcswidth_unclustered $g]
            }
            set i $aftercluster
        }

        return $width
    }
    proc wcswidth_single {char} {
        scan $char %c dec
        if {$dec <= 255} {
            if {$dec == 9} {
                #tab always represented by at least one char in terminal etc.
                #caller will need to process tabs themselves to determine extra width applicable to their circumstance.
                return 1
            }
            if {($dec < 31 || $dec == 127)} {
                return 0
            }
            #review - non-printing ascii? why does textutil::wcswidth report 1 ??
            #todo - compare with python or other lang wcwidth
            return 1
        } elseif {$dec < 917504 || $dec > 917631} {
            #TODO - various other joiners and non-printing chars
            return [textutil::wcswidth_char $dec] ;#note textutil::wcswidth_char takes a decimal codepoint!
            #may return -1 - REVIEW
        }
        return 0
    }
    proc wcswidth_unclustered1 {string} {
        set width 0
        foreach c [split $string {}] {
            scan $c %c dec
            if {$dec <= 255} {
                if {$dec == 9 || ($dec >= 31 && $dec != 127)} {
                    #review - non-printing ascii? why does textutil::wcswidth report 1 ??
                    #todo - compare with python or other lang wcwidth
                    incr width
                }
            } elseif {$dec < 917504 || $dec > 917631} {
                #TODO - various other joiners and non-printing chars
                set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
                if {$w < 0} {
                    return -1
                } else {
                    incr width $w
                }
            }
        }
        return $width
    }
    #todo - consider disallowing/erroring out when \r \n in string?
    # - tab/vtab?
    # - compare with wcswidth returning -1 for entire string containing such in python,perl
    proc wcswidth_unclustered {string} {
        #faster than textutil::wcswidth (at least for string up to a few K in length)
        #..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
        #Tcl initial evaluation stack size is 2000 (? review)
        #we can only split the string at arbitrary points like this because we are specifically dealing with input that has no clusters!.
        set chunksize 2000
        set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}]
        set width 0
        set startidx 0
        set endidx [expr {$startidx + $chunksize -1}]
        for {set i 0} {$i < $chunks_required} {incr i} {
            set chunk [tcl::string::range $string $startidx $endidx]

            set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
            foreach dec $codes {
                if {$dec <= 255} {
                    if {($dec ==9 || ($dec >= 31 && $dec != 127))} {
                        #review - non-printing ascii? why does textutil::wcswidth report 1 ??
                        #todo - compare with python or other lang wcswidth
                        incr width
                    }
                } elseif {$dec < 917504 || $dec > 917631} {
                    #TODO - various other joiners and non-printing chars
                    set w [textutil::wcswidth_char $dec]
                    if {$w < 0} {
                        return -1
                    } else {
                        incr width $w
                    }
                }
            }
            incr startidx   $chunksize
            incr endidx     $chunksize
        }
        return $width
    }
    # ------------------------------------------------------------------------------------------------------

    proc wcswidth0 {string} {
        #faster than textutil::wcswidth (at least for string up to a few K in length)
        #..but - 'scan' is horrible for 400K+
        #TODO
        set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
        set width 0
        foreach dec $codes {
            #unicode Tags block zero width
            if {$dec < 917504 || $dec > 917631} {
                if {$dec <= 255} {
                    #review - non-printing ascii? why does textutil::wcswidth report 1 ??
                    #todo - compare with python or other lang wcwidth
                    if {!($dec < 31 || $dec == 127)} {
                        incr width
                    }
                } else {
                    #TODO - various other joiners and non-printing chars
                    set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
                    if {$w < 0} {
                    return -1
                    } else {
                    incr width $w
                    }
                }
            }
        }
        return $width
    }
    proc wcswidth2 {string} {
        set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
        set widths [lmap dec $codes {textutil::wcswidth_char $dec}]
        if {-1 in $widths} {
            return -1
        }
        return [tcl::mathop::+ {*}$widths]
    }

    #prerequisites - no ansi escapes -  no newlines  - utf8 encoding assumed
    #review - what about \r \t \b ?
    #NO processing of \b - already handled in ansi::printing_length which then calls this
    #this version breaks string into sequences of ascii vs unicode
    proc ansifreestring_width {text} {
        #caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
        #we can c0 control characters after or while processing ansi escapes.
        #we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
        #anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
        #if {[tcl::string::first \033 $text] >= 0} {
        #    error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
        #}


        #todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
        #as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.


        #- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
        #- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
        #- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
        #- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
        #- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
        #
        # initial simplistic approach is just to strip these ... todo REVIEW

        #experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
        #(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
        #set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
        #if {[regexp $re_leading_diacritic $text]} {
        #    set text " $text"
        #}
        set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
        set text [regsub -all $re_diacritics $text ""]

        # -- --- --- --- --- --- ---
        #review
        #if we strip out ZWJ \u200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis
        #as at 2024 - textutil::wcswidth just uses the unicode east-asian width property data and doesn't seem to handle these specially - it counts this joiner and others as one wide (also BOM \uFFEF)

        #TODO - once we have proper grapheme cluster splitting - work out which of these characters should be left in and/or when exactly their length-effects apply
        #
        #for now - strip them out

        #ZWNJ \u200c should also be zero length - but as a 'non' joiner - it should add zero to the length
        #ZWSP \u200b zero width space

        #\uFFEFBOM/ ZWNBSP and others that should be zero width
        #todo - work out proper way to mark/group zero width.

        #set text [tcl::string::map [list \u200b "" \u200c "" \u200d "" \uFFEF ""] $text]
        set text [tcl::string::map [list \u200b "" \u200c "" \u200d ""] $text]
        #\uFFEF tends to print as 1 length replacement char - REVIEW
        #\uFFFF varies between terminals - some print replacement char (width 1) some print nothing (width 0)
        # -- --- --- --- --- --- ---

        #we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
        #todo - document that these shouldn't be present in input rather than explicitly checking here

        #c0 controls + del (127 7f) - tab
        #set re_ascii_c0 {[\U0000-\U001F]}
        set re_ascii_c0 {[\u0000-\u0008\u000A-\u001F\u007F]}
        set text [regsub -all $re_ascii_c0 $text ""]

        #c1 controls - first section of the Latin-1 Supplement block - all non-printable from a utf-8 perspective
        #some or all of these may have a visual representation in other encodings e.g cp855 seems to have 1 width for them all
        #we are presuming that the text supplied has been converted from any other encoding to utf8 - so we don't need to handle that here
        #they should also be unlikely to be present in an ansi-free string (as is a precondition for use of this function)
        set text [regsub -all {[\u0080-\u009f]+} $text ""]


        #short-circuit basic cases
        #support tcl pre 2023-11 - see regexp bug below
        #if {![regexp {[\u100-\U10FFFF]} $text]} {
        #    return [tcl::string::length $text]
        #}
        if {![regexp "\[\u100-\U10FFFF\]" $text]} {
            return [tcl::string::length $text]
            #punk::char::wcswidth has to split and examine dec value of each code
            #By stripping controls + 7F (leaving tab) we've already eliminated the non-printable ascii - REVIEW
            #return [punk::char::wcswidth $text] ;#still use our wcswidth to account for non-printable ascii
        }

        #split just to get the standalone character widths - and then scan for other combiners (?) - or scan for clusters first?

        #review
        #set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
        #tcl pre 2023-11 - braced high unicode regexes don't work
        #fixed in bug-4ed788c618 2023-11
        #set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only

        #maintain unicode as sequences - todo - scan for grapheme clusters
        #set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text]
        set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0000-\u00FF\])+" $text]
        set len 0
        foreach {uc ascii} $uc_sequences {
            #puts "-ascii $ascii"
            #puts "-uc $uc"
            incr len [tcl::string::length $ascii]
            #textutil::wcswidth uses unicode data
            #fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for)
            #todo - find something that understands grapheme clusters - needed also for grapheme_split
            #use punk::char::wcswidth - faster than the string split in textutil::wcswidth but still uses textutil::wcswidth_char
            incr len [punk::char::wcswidth $uc]
        }
        #todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
        return $len
    }

    #kept as a fallback for review/test if textutil::wcswidth doesn't do what we require on all terminals.
    #this version looks at console testwidths if they've been cached.
    #It is relatively fast - but tests unicode widths char by char - so won't be useful going forward for grapheme clusters.
    proc ansifreestring_width2 {text} {
        #caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines


        #we can c0 control characters after or while processing ansi escapes.
        #we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
        #anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
        #if {[tcl::string::first \033 $text] >= 0} {
        #    error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
        #}


        #todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
        #as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.


        #- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
        #- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
        #- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
        #- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
        #- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
        #
        # initial simplistic approach is just to strip these ... todo REVIEW

        #experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
        #(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
        #set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
        #if {[regexp $re_leading_diacritic $text]} {
        #    set text " $text"
        #}
        set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
        set text [regsub -all $re_diacritics $text ""]

        #review
        #if we strip out ZWJ \u0200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis
        #as at 2024 - textutil::wcswidth doesn't seem to be aware of these - and counts the joiner as one wide

        #ZWNJ \u0200c should also be zero length - but as a 'non' joiner - it should add zero to the length

        #ZWSP \u0200b zero width space


        #we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
        #todo - document that these shouldn't be present in input rather than explicitly checking here
        set re_ascii_c0 {[\U0000-\U001F]}
        set text [regsub -all $re_ascii_c0 $text ""]

        #short-circuit basic cases
        #support tcl pre 2023-11 - see regexp bug below
        #if {![regexp {[\uFF-\U10FFFF]} $text]} {
        #    return [tcl::string::length $text]
        #}
        if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
            return [tcl::string::length $text]
        }

        #review - wcswidth should detect these
        set re_ascii_fullwidth {[\uFF01-\uFF5e]}

        set doublewidth_char_count 0
        set zerowidth_char_count 0
        #split just to get the standalone character widths - and then scan for other combiners (?)
        #review
        #set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
        #tcl pre 2023-11 - braced high unicode regexes don't work
        #fixed in bug-4ed788c618 2023-11
        #set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text]
        #set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4}
        set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
        foreach c $uc_chars {
            if {[regexp $re_ascii_fullwidth $c]} {
                incr doublewidth_char_count
            } else {
                #review
                # a)- terminals lie - so we could have a bad cached testwidth
                # b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in tclllib docs.
                #(character width is a complex context-dependent topic)
                # c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run.
                # d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here
                #Despite all this - the mechanism is hoped to give best effort consistency for terminals
                #further work needed for combining emojis etc - which can't be done in a per character loop
                #todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split
                # to process sequences of unicode.
                #- and the user has the option to test character sets first if terminal position reporting gives better results
                if {[char_info_is_testwidth_cached $c]} {
                    set width [char_info_testwidth_cached $c]
                } else {
                    #textutil::wcswidth uses unicode data
                    #fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for)
                    set width [textutil::wcswidth_char [scan $c %c]]
                }
                if {$width == 0} {
                    incr zerowidth_char_count
                } elseif {$width == 2} {
                    incr doublewidth_char_count
                }
            }
        }
        #todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
        return [expr {[tcl::string::length $text] + $doublewidth_char_count - $zerowidth_char_count}]
    }

    #slow - textutil::wcswidth is slow with mixed ascii uc
    proc ansifreestring_width3 {text} {
        #caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines


        #we can c0 control characters after or while processing ansi escapes.
        #we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
        #anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
        #if {[tcl::string::first \033 $text] >= 0} {
        #    error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
        #}


        #review - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
        # - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
        #as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.

        #- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
        #- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
        #- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
        #- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
        #- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
        #
        # initial simplistic approach is just to strip these ... todo REVIEW

        #experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
        #(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
        #set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
        #if {[regexp $re_leading_diacritic $text]} {
        #    set text " $text"
        #}
        set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
        set text [regsub -all $re_diacritics $text ""]

        #we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
        #todo - document that these shouldn't be present in input rather than explicitly checking here
        set re_ascii_c0 {[\U0000-\U001F]}
        set text [regsub -all $re_ascii_c0 $text ""]

        #short-circuit basic cases
        #support tcl pre 2023-11 - see regexp bug below
        #if {![regexp {[\uFF-\U10FFFF]} $text]} {
        #    return [tcl::string::length $text]
        #}
        if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
            return [tcl::string::length $text]
        }

        #slow when ascii mixed with unicode (but why?)
        return [punk::char::wcswidth $text]
    }
    #This shouldn't be called on text containing ansi codes!
    proc strip_nonprinting_ascii {str} {
        #review - some single-byte 'control' chars have visual representations e.g ETX as heart depending on font/codepage
        #It is currently used for screen display width calculations
        #equivalent for various unicode combining chars etc?
        set map [list {*}{
            \x00 ""
            \x07 ""
            \x7f ""
        }]
        return [tcl::string::map $map $str]
    }

    #ISO-3166-1 alpha-2 country codes to flag emojis
    #These can change over time.
    #todo - lookup from official sources - and update as needed
    #
    variable flags [dict create {*}{
        AC \U1F1E6\U1F1E8
        AD \U1F1E6\U1F1E9
        AE \U1F1E6\U1F1EA
        AF \U1F1E6\U1F1EB
        AG \U1F1E6\U1F1EC
        AI \U1F1E6\U1F1EE
        AL \U1F1E6\U1F1F1
        AM \U1F1E6\U1F1F2
        AO \U1F1E6\U1F1F4
        AQ \U1F1E6\U1F1F6
        AR \U1F1E6\U1F1F7
        AS \U1F1E6\U1F1F8
        AT \U1F1E6\U1F1F9
        AU \U1F1E6\U1F1FA
        AW \U1F1E6\U1F1FC
        AX \U1F1E6\U1F1FD
        AZ \U1F1E6\U1F1FF
        BA \U1F1E7\U1F1E6
        BB \U1F1E7\U1F1E7
        BD \U1F1E7\U1F1E9
        BE \U1F1E7\U1F1EA
        BF \U1F1E7\U1F1EB
        BG \U1F1E7\U1F1EC
        BH \U1F1E7\U1F1ED
        BI \U1F1E7\U1F1EE
        BJ \U1F1E7\U1F1EF
        BL \U1F1E7\U1F1F1
        BM \U1F1E7\U1F1F2
        BN \U1F1E7\U1F1F3
        BO \U1F1E7\U1F1F4
        BQ \U1F1E7\U1F1F6
        BR \U1F1E7\U1F1F7
        BS \U1F1E7\U1F1F8
        BT \U1F1E7\U1F1F9
        BV \U1F1E7\U1F1FB
        BW \U1F1E7\U1F1FC
        BY \U1F1E7\U1F1FE
        BZ \U1F1E7\U1F1FF
        CA \U1F1E8\U1F1E6
        CC \U1F1E8\U1F1E8
        CD \U1F1E8\U1F1E9
        CF \U1F1E8\U1F1Eb
        CG \U1F1E8\U1F1Ec
        CH \U1F1E8\U1F1ED
        CI \U1F1E8\U1F1EE
        CK \U1F1E8\U1F1F0
        CL \U1F1E8\U1F1F1
        CM \U1F1E8\U1F1F2
        CN \U1F1E8\U1F1F3
        CO \U1F1E8\U1F1F4
        CP \U1F1E8\U1F1F5
        CR \U1F1E8\U1F1F7
        CS \U1F1E8\U1F1F8
        CU \U1F1E8\U1F1FA
        CV \U1F1E8\U1F1FB
        CW \U1F1E8\U1F1FC
        CX \U1F1E8\U1F1FE
        CY \U1F1E8\U1F1FE
        CZ \U1F1E8\U1F1FF
        DE \U1F1E9\U1F1EA
        DG \U1F1E9\U1F1EC
        DJ \U1F1E9\U1F1EF
        DK \U1F1E9\U1F1F0
        DM \U1F1E9\U1F1F2
        DO \U1F1E9\U1F1F4
        DZ \U1F1E9\U1F1FF
        EA \U1F1EA\U1F1E6
        EC \U1F1EA\U1F1E8
        EE \U1F1EA\U1F1EA
        EG \U1F1EA\U1F1EC
        EH \U1F1EA\U1F1ED
        ER \U1F1EA\U1F1F7
        ES \U1F1EA\U1F1F8
        ET \U1F1EA\U1F1F9
        EU \U1F1EA\U1F1FA
        FI \U1F1EB\U1F1EE
        FJ \U1F1EB\U1F1EF
        FK \U1F1EB\U1F1F0
        FM \U1F1EB\U1F1F2
        FO \U1F1EB\U1F1F4
        FR \U1F1EB\U1F1F7
        GA \U1F1EC\U1F1E6
        GB \U1F1EC\U1F1E7
        GD \U1F1EC\U1F1E9
        GE \U1F1EC\U1F1EA
        GF \U1F1EC\U1F1EB
        GG \U1F1EC\U1F1EC
        GH \U1F1EC\U1F1ED
        GI \U1F1EC\U1F1EE
        GL \U1F1EC\U1F1F1
        GM \U1F1EC\U1F1F2
        GN \U1F1EC\U1F1F3
        GP \U1F1EC\U1F1F5
        GQ \U1F1EC\U1F1F6
        GR \U1F1EC\U1F1F7
        GS \U1F1EC\U1F1F8
        GT \U1F1EC\U1F1F9
        GU \U1F1EC\U1F1FA
        GW \U1F1EC\U1F1FC
        GY \U1F1EC\U1F1FE
        HK \U1F1ED\U1F1F0
        HM \U1F1ED\U1F1F2
        HN \U1F1ED\U1F1F3
        HR \U1F1ED\U1F1F7
        HT \U1F1ED\U1F1F9
        HU \U1F1ED\U1F1FA
        IC \U1F1EE\U1F1E8
        ID \U1F1EE\U1F1E9
        IE \U1F1EE\U1F1EA
        IL \U1F1EE\U1F1F1
        IM \U1F1EE\U1F1F2
        IN \U1F1EE\U1F1F3
        IO \U1F1EE\U1F1F4
        IQ \U1F1EE\U1F1F6
        IR \U1F1EE\U1F1F7
        IS \U1F1EE\U1F1F8
        IT \U1F1EE\U1F1F9
        JE \U1F1EF\U1F1EA
        JM \U1F1EF\U1F1F2
        JO \U1F1EF\U1F1F4
        JP \U1F1EF\U1F1F5
        KE \U1F1F0\U1F1EA
        KG \U1F1F0\U1F1EC
        KH \U1F1F0\U1F1ED
        KI \U1F1F0\U1F1EE
        KM \U1F1F0\U1F1F2
        KN \U1F1F0\U1F1F3
        KP \U1F1F0\U1F1F5
        KR \U1F1F0\U1F1F7
        KW \U1F1F0\U1F1FC
        KY \U1F1F0\U1F1FE
        KZ \U1F1F0\U1F1FF
        LA \U1F1F1\U1F1E6
        LB \U1F1F1\U1F1E7
        LC \U1F1F1\U1F1E8
        LI \U1F1F1\U1F1EE
        LK \U1F1F1\U1F1F0
        LR \U1F1F1\U1F1F7
        LS \U1F1F1\U1F1F8
        LT \U1F1F1\U1F1F9
        LU \U1F1F1\U1F1FA
        LV \U1F1F1\U1F1FB
        LY \U1F1F1\U1F1FE
        MA \U1F1F2\U1F1E6
        MC \U1F1F2\U1F1E8
        MD \U1F1F2\U1F1E9
        ME \U1F1F2\U1F1EA
        MF \U1F1F2\U1F1EB
        MG \U1F1F2\U1F1EC
        MH \U1F1F2\U1F1ED
        MK \U1F1F2\U1F1F0
        ML \U1F1F2\U1F1F1
        MM \U1F1F2\U1F1F2
        MN \U1F1F2\U1F1F3
        MO \U1F1F2\U1F1F4
        MP \U1F1F2\U1F1F5
        MQ \U1F1F2\U1F1F6
        MR \U1F1F2\U1F1F7
        MS \U1F1F2\U1F1F8
        MT \U1F1F2\U1F1F9
        MU \U1F1F2\U1F1FA
        MV \U1F1F2\U1F1FB
        MW \U1F1F2\U1F1FC
        MX \U1F1F2\U1F1FD
        MY \U1F1F2\U1F1FE
        MZ \U1F1F2\U1F1FF
        NA \U1F1F3\U1F1E6
        NC \U1F1F3\U1F1E8
        NE \U1F1F3\U1F1EA
        NF \U1F1F3\U1F1EB
        NG \U1F1F3\U1F1EC
        NI \U1F1F3\U1F1EE
        NL \U1F1F3\U1F1F1
        NO \U1F1F3\U1F1F4
        NP \U1F1F3\U1F1F5
        NR \U1F1F3\U1F1F7
        NU \U1F1F3\U1F1F8
        NZ \U1F1F3\U1F1ff
        OM \U1F1F4\U1F1F2
        PA \U1F1F5\U1F1E6
        PE \U1F1F5\U1F1EA
        PF \U1F1F5\U1F1EB
        PG \U1F1F5\U1F1EC
        PH \U1F1F5\U1F1ED
        PK \U1F1F5\U1F1ED
        PL \U1F1F5\U1F1F1
        PM \U1F1F5\U1F1F3
        PN \U1F1F5\U1F1F3
        PR \U1F1F5\U1F1F7
        PS \U1F1F5\U1F1F8
        PT \U1F1F5\U1F1F9
        PW \U1F1F5\U1F1FC
        PY \U1F1F5\U1F1FE
        QA \U1F1F6\U1F1E6
        RE \U1F1F7\U1F1EA
        RO \U1F1F7\U1F1F4
        RS \U1F1F7\U1F1F7
        RU \U1F1F7\U1F1FA
        RW \U1F1F7\U1F1FC
        SA \U1F1F8\U1F1E6
        SB \U1F1F8\U1F1E7
        SC \U1F1F8\U1F1E8
        SD \U1F1F8\U1F1E9
        SE \U1F1F8\U1F1EA
        SG \U1F1F8\U1F1EC
        SH \U1F1F8\U1F1ED
        SI \U1F1F8\U1F1EE
        SJ \U1F1F8\U1F1EF
        SK \U1F1F8\U1F1F0
        SL \U1F1F8\U1F1F1
        SM \U1F1F8\U1F1F2
        SN \U1F1F8\U1F1F3
        SO \U1F1F8\U1F1F4
        SR \U1F1F8\U1F1F7
        SS \U1F1F8\U1F1F8
        ST \U1F1F8\U1F1F9
        SV \U1F1F8\U1F1F7
        SX \U1F1F8\U1F1F8
        SY \U1F1F8\U1F1FE
        SZ \U1F1F8\U1F1FF
        TA \U1F1F9\U1F1E6
        TC \U1F1F9\U1F1E8
        TD \U1F1F9\U1F1E9
        TF \U1F1F9\U1F1EB
        TG \U1F1F9\U1F1EC
        TH \U1F1F9\U1F1ED
        TJ \U1F1F9\U1F1EF
        TK \U1F1F9\U1F1F0
        TL \U1F1F9\U1F1F1
        TM \U1F1F9\U1F1F2
        TN \U1F1F9\U1F1F3
        TO \U1F1F9\U1F1F4
        TR \U1F1F9\U1F1F7
        TT \U1F1F9\U1F1F9
        TV \U1F1F9\U1F1FB
        TW \U1F1F9\U1F1FC
        TZ \U1F1F9\U1F1FF
        UA \U1F1FA\U1F1E6
        UG \U1F1FA\U1F1EC
        UM \U1F1FA\U1F1F2
        US \U1F1FA\U1F1F8
        UY \U1F1FA\U1F1FE
        UZ \U1F1FA\U1F1FF
        VA \U1F1FB\U1F1E6
        VC \U1F1FB\U1F1E8
        VE \U1F1FB\U1F1EA
        VG \U1F1FB\U1F1EC
        VI \U1F1FB\U1F1EE
        VN \U1F1FB\U1F1F3
        VU \U1F1FB\U1F1FA
        WF \U1F1FC\U1F1EB
        WS \U1F1FC\U1F1F8
        XK \U1F1FD\U1F1F3
        YE \U1F1FE\U1F1EA
        YT \U1F1FE\U1F1F9
        ZA \U1F1FF\U1F1E6
        ZM \U1F1FF\U1F1F2
        ZW \U1F1FF\U1F1FC
    }]
    variable rflags
    dict for {k v} $flags {
        dict set rflags $v $k
    }


    proc flag_from_ascii {code} {
        variable flags
        if {[regexp {^[A-Z]{2}$} $code]} {
            if {[dict exists $flags $code]} {
                return [dict get $flags $code]
            } else {
                error "unsupported flags code: $code"
            }
        } else {
             #try as subregion
             #e.g gbeng,gbwls,gbsct
             return \U1f3f4[tag_from_ascii $code]\Ue007f
        }
    }
    proc flag_to_ascii {charsequence} {
        variable rflags
        if {[dict exists $rflags $charsequence]} {
            return [dict get $rflags $charsequence]
        }
        if {[string index $charsequence 0] eq "\U1F3F4" && [string index $charsequence end] eq "\UE007F"} {
            #subdivision flag
            set tag [string range $charsequence 1 end-1]
            return [tag_to_ascii $tag]
        }
        error "unknown flag $charsequence"
    }

    proc tag_to_ascii {t} {
        set fmt [string repeat %c [string length $t]]
        set declist [scan $t $fmt]
        #unicode Tags block - e0000 to e007f
        set declist [lmap dec $declist {
                if {$dec < 917504 || $dec > 917631} {
                        error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in unicode Tags block range 917504-917631 (e0000-e007f)"
                }
                incr dec -917504
        }]
        return [format $fmt {*}$declist]
    }
    proc tag_from_ascii {a} {
        set fmt [string repeat %c [string length $a]]
        set declist [scan $a $fmt]
        set declist [lmap dec $declist {
                if {$dec > 127} {
                error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in ascii range 0-127"
                }
                incr dec 917504
        }]
        return [format $fmt {*}$declist]
    }

    #split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ)
    proc combiner_split {text} {
        #split into odd numbered list (or zero) in a similar way to punk::ansi::ta::_perlish_split
        #
        #set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
        set graphemes [list]
        if {[tcl::string::length $text] == 0} {
            return {}
        }
        set list [list]
        set start 0
        set strlen [tcl::string::length $text]
        #make sure our regexes aren't non-greedy - or we may not have exit condition for loop
        #review
        while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} {
            lassign $match      matchStart matchEnd
            #puts "->start $start ->match $matchStart $matchEnd"
            lappend list [tcl::string::range $text $start $matchStart-1] [tcl::string::range $text $matchStart $matchEnd]
            set start [expr {$matchEnd+1}]
        }
        lappend list [tcl::string::range $text $start end]
    }

    #ZWJ ZWNJ ?
    #SWSP ?

    #1st shot - basic diacritics
    #todo - become aware of unicode grapheme cluster boundaries
    #This is difficult in Tcl without unicode property based Character Classes in the regex engine
    #review - this needs to be performant - it is used a lot by punk terminal/ansi features
    #todo - trie data structures for unicode?
    #for now we can at least combine diacritics
    #should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters
    #Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs)
    #This still leaves a whole class of clusters.. korean etc unhandled :/
    #todo - tk::startOfCluster / tk::endOfCluster - try to get it brought into Tcl
    #https://core.tcl-lang.org/tcl/tktview/a4c7eeaf63
    proc grapheme_split {text {return list}} {
        #we should treat \r\n as a single grapheme cluster (as tk::endOfCluster does)
        set components [list]
        set csplits [combiner_split $text]
        foreach {pt combiners} [lrange $csplits 0 end-1] {
            set clist [split $pt ""]
            #review
            #lset clist end [tcl::string::cat [lindex $clist end] $combiners]
            ledit clist end end [tcl::string::cat [lindex $clist end] $combiners]
            lappend components {*}$clist
            #lappend components {*}[lrange $clist 0 end-1]
            #lappend components [tcl::string::cat [lindex $clist end] $combiners]
        }
        #last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
        if {[lindex $csplits end] ne ""} {
            lappend components {*}[split [lindex $csplits end] ""]
        }
        #we have a list of diacritically assembled components - we now need to combine any ZWJ sequences into single grapheme clusters as well as \r\n

        #https://www.unicode.org/Public/17.0.0/ucd/auxiliary/GraphemeBreakTest.html

        #Emoji modifiers \U1f3fb to \U1f3ff are also zero-width and combine with the preceding char
        #they don't form a 'base' for combining with other characters though.

        #blocks that can contain zero-width joiners and combining chars that combine with the preceding char:
        # Emoticons (1F600–1F64F)
        # Miscellaneous Symbols and Pictographs (1F300–1F5FF)
        # Supplemental Symbols and Pictographs (1F900–1F9FF)
        # Symbols and Pictographs Extended-A (1FA70–1FAFF)
        # Transport and Map Symbols (1F680–1F6FF)
        # Dingbats (2700–27BF)
        # Miscellaneous Symbols (2600–26FF)

        #RI (regional indicators) (\U1f1e6-\U1f1ff) combine in pairs to form flag emojis - but they don't combine with modifiers or ZWJs to form longer clusters.

        #review \uFE0F variation selector 16 - forces emoji presentation for preceding char

        #This is a basic implementation that does not check that all combinations are valid.
        set graphemes [list]
        set current_cluster ""

        set cluster_base 0  ;#is the current cluster based on a char that can be combined with modifiers/ZWJs (e.g emoji or other cluster-based char)
        #                    or is it based on a char that can't be combined with modifiers/ZWJs (e.g ascii letter)
        set cluster_base_RI 0 ;#is the current cluster based on a regional indicator char - which can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.

        set current_cluster_is_extensible 0
        for {set i 0} {$i < [llength $components] } {incr i} {
            set component [lindex $components $i]
            if {$component eq "\r" && [lindex $components $i+1] eq "\n"} {
                if {$current_cluster ne ""} {
                    lappend graphemes $current_cluster
                }
                lappend graphemes "\r\n"
                incr i ;#skip the \n as we've already processed it as part of the cluster
                set current_cluster ""
                set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                set current_cluster_is_extensible 0
            } elseif {$component eq "\u200d"} {
                if {$current_cluster eq ""} {
                    #ZWJ at start of string - treat as separate grapheme cluster - but isn't a valid base for further combining with more ZWJs or modifiers
                    set current_cluster $component
                    set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                    set current_cluster_is_extensible 0
                } else {
                    if {$cluster_base} {
                        if {$current_cluster_is_extensible} {
                            #a double (or longer) ZWJ sequence in a row is part of the last cluster - but not extensible anymore.
                            append current_cluster $component
                            set current_is_cluster_extensible 0
                        } else {
                            append current_cluster $component
                            if {$cluster_base_RI} {
                                #regional indicators can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
                                set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                                set current_cluster_is_extensible 0
                                #we can keep adding ZWJs or modifiers though
                            } else {
                                set current_cluster_is_extensible 1
                            }
                        }
                    } else {
                        #ZWJ after non-cluster-based char - non extensible but we continue appending ZWJs to the current cluster.
                        append current_cluster $component
                        set current_cluster_is_extensible 0
                    }

                }
            } elseif {[regexp {[\U1f3fb-\U1f3ff]} $component]} {
                #emoji modifier - join with previous component
                if {$current_cluster eq ""} {
                    #modifier at start of string -  not a valid base for further combining with more modifiers or ZWJs - but we continue appending modifiers to the current cluster.
                    set current_cluster $component
                    set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                } else {
                    if {$cluster_base} {
                        if {$current_cluster_is_extensible} {
                            append current_cluster $component
                            #invalidate the base!
                            set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                        } else {
                            append current_cluster $component
                        }
                    } else {
                        #modifier after non-cluster-based char - non extensible but we continue appending modifiers to the current cluster.
                        append current_cluster $component
                    }
                    #review
                    # \u1f33e\u1f3fe\u200d\u2f3fe\u200d\u1f33e is 2 clusters
                    #This is because after first zwj, we applied a modifier - not a valid base.
                }
                set current_cluster_is_extensible 0
            } else {
                if {$current_cluster eq ""} {
                    grapheme_split::start_cluster $component
                } else {
                    #have existing cluster data
                    if {$current_cluster_is_extensible} {
                        #assert - if current_cluster_is_extensible then cluster_base should currently be true.
                        #if the current char is a base - we can append to existing cluster, but if it's not a base, then we start a new cluster even if we had seen a ZWJ before.
                        if {[regexp {[\U1f600-\U1f64f\U1f300-\U1f5ff\U1f900-\U1f9ff\U1fa70-\U1faff\U1f680-\U1f6ff\U2700-\U27bf\U2600-\u26ff]} $component]} {
                            append current_cluster $component
                            set cluster_base 1
                        } else {
                            lappend graphemes $current_cluster
                            set current_cluster $component
                            set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                        }
                        set current_cluster_is_extensible 0
                    } elseif {$cluster_base_RI} {
                        #regional indicators can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
                        if {[regexp {[\U1f1e6-\U1f1ff]} $component]} {
                            append current_cluster $component

                            #invalidate the base - we can't combine more than 2 RIs in a cluster, and they don't combine with modifiers or ZWJs to form longer clusters.
                            #we can however add more ZWJs or modifiers to the cluster - but they don't make it extensible for combining with more RIs
                            set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
                        } else {
                            #something else while RI cluster is open - end the current cluster and start a new one with the current char.
                            lappend graphemes $current_cluster
                            grapheme_split::start_cluster $component
                        }
                        set current_cluster_is_extensible 0
                    } else {
                        lappend graphemes $current_cluster
                        grapheme_split::start_cluster $component
                    }
                }
            }
        }
        if {$current_cluster ne ""} {
            lappend graphemes $current_cluster
        }
        if {$return eq "list"} {
             return $graphemes
        } else {
             return [dict create list $graphemes last_extensible $current_cluster_is_extensible base $cluster_base RI_base $cluster_base_RI]
        }
    }
    namespace eval grapheme_split {
        proc about {} {
            return "helper functions for the punk::char::grapheme_split proc - not for general use outside of that context"
        }
        proc reset_base {} {
            upvar cluster_base cluster_base
            upvar cluster_base_RI cluster_base_RI
            set cluster_base 0
            set cluster_base_RI 0
        }
        proc start_cluster {component} {
            upvar current_cluster current_cluster
            upvar cluster_base cluster_base
            upvar cluster_base_RI cluster_base_RI
            upvar current_cluster_is_extensible current_cluster_is_extensible
            set current_cluster $component
            #             emoticons      symbols/pictographs supplemental symbols/pictographs  extended-a symbols/pictographs   transport/map symbols    dingbats        misc symbols
            if {[regexp {[\U1f600-\U1f64f\U1f300-\U1f5ff\U1f900-\U1f9ff\U1fa70-\U1faff\U1f680-\U1f6ff\U2700-\U27bf\U2600-\u26ff]} $component]} {
                set cluster_base 1
                set cluster_base_RI 0
            } elseif {[regexp {[\U1f1e6-\U1f1ff]} $component]} {
                #regional indicator - can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
                set cluster_base 1
                set cluster_base_RI 1
            } else {
                #grapheme_split::reset_base
                set cluster_base 0
                set cluster_base_RI 0
            }
            #not extensible until we see a ZWJ.
            set current_cluster_is_extensible 0
        }
    }

    #https://thottingal.in/blog/2026/03/22/complex-scripts-in-terminal/#:~:text=Once%20shaping%20is%20complete%2C%20glyphs,a%20n%20c%20e%20%E2%8C%89
    #Malayalam test string with 3 ligatures, 7 codepoints - and should be 3 grapheme clusters.
    set str "സന്തോഷ്"
    # ------------------------------------------------------------------------------------------------------
    #test
    # ------------------------------------------------------------------------------------------------------
    proc grapheme_split_tk {string} {
        #note that the sequence \r\n is generally seen as a single grapheme cluster.

        #if {![regexp "\[\uFF-\U10FFFF\]" $string]} {
        #    #only ascii (7 or 8 bit) - no joiners or unicode
        #    return [split $string {}]
        #}
        package require tk
        set i 0
        set graphemes [list]
        while {$i < [tcl::string::length $string]} {
            set aftercluster [tk::endOfCluster $string $i]
            lappend graphemes [string range $string $i $aftercluster-1]
            set i $aftercluster
        }
        return $graphemes
    }
    proc grapheme_split_dec {text} {
        set graphemes [list]
        set csplits [combiner_split $text]
        foreach {pt combiners} [lrange $csplits 0 end-1] {
            set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]] ;#warning scan %c... slow for v large strings (e.g 400k+)
            set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
            lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
            lappend graphemes {*}$pt_decs
        }
        #last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
        if {[lindex $csplits end] ne ""} {
            lappend graphemes {*}[scan [lindex $csplits end] [tcl::string::repeat %c [tcl::string::length [lindex $csplits end]]]]
        }
        return $graphemes
    }
    proc grapheme_split_dec2 {text} {
        set graphemes [list]
        set csplits [combiner_split $text]
        foreach {pt combiners} $csplits {
            set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]]
            if {$combiners ne ""} {
                set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
                lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
            }
            lappend graphemes {*}$pt_decs
        }
        return $graphemes
    }
    proc grapheme_split2 {text} {
        set graphemes [list]
        set csplits [combiner_split $text]
        foreach {pt combiners} [lrange $csplits 0 end-1] {
            set clist [split $pt ""]
            lappend graphemes {*}[lrange $clist 0 end-1] [tcl::string::cat [lindex $clist end] $combiners]
        }
        #last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
        if {[lindex $csplits end] ne ""} {
            lappend graphemes {*}[split [lindex $csplits end] ""]
        }
        return $graphemes
    }

    # -- --- --- --- ---
    #will accept a single char or a string - test using console cursor position reporting
    #unreliable
    proc char_info_testwidth {ch {emit 0}} {
        package require punk::console
        #uses cursor movement and relies on console to report position.. which it may do properly for single chars - but may misreport for combinations that combine into a single glyph
        tailcall punk::console::test_char_width $ch $emit
    }
    proc char_info_testwidth_cached {char} {
        variable charinfo
        set dec [scan $char %c]
        set twidth ""
        if {[tcl::dict::exists $charinfo $dec testwidth]} {
            set twidth [tcl::dict::get $charinfo $dec testwidth]
        }
        if {$twidth eq ""} {
            set width [char_info_testwidth $char]
            tcl::dict::set charinfo $dec testwidth $width
            return $width
        } else {
            return $twidth
        }
    }
    proc char_info_is_testwidth_cached {char} {
        variable charinfo
        return [tcl::dict::exists $charinfo [scan $char %c] testwidth]
    }
    # -- --- --- --- ---


}


tcl::namespace::eval punk::char::lib {
    variable num_superscript
    #digits and a small set of related symbols
    set num_superscript [list {*}{
        i   \u2071
        0   \u2070
        1   \u00B9
        2   \u00B2
        3   \u00B3
        4   \u2074
        5   \u2075
        6   \u2076
        7   \u2077
        8   \u2078
        9   \u2079
        +   \u207A
        -   \u207B
        =   \u207C
        (   \u207D
        )   \u207E
        n   \u207F
    }]
    variable num_supersub_re
    set num_supersub_re {^[0-9in+\-\(\)\=]+$}
    proc superscript_number {n} {
        if {$n eq ""} {return ""}
        variable num_superscript
        variable num_supersub_re
        if {![regexp $num_supersub_re $n]} {
            error "Cannot convert string '$n' to superscript. valid chars: [dict keys $num_superscript]"
        }
        return [string map $num_superscript $n]
    }
    set num_subscript [list {*}{
        i       \u1D62
        0       \u2080
        1       \u2081
        2       \u2082
        3       \u2083
        4       \u2084
        5       \u2085
        6       \u2086
        7       \u2087
        8       \u2088
        9       \u2089
        +       \u208A
        -       \u208B
        =       \u208C
        (       \u208D
        )       \u208E
        n       \u2099
    }]
    proc subscript_number {n} {
        if {$n eq ""} {return ""}
        variable num_subscript
        variable num_supersub_re
        if {![regexp $num_supersub_re $n]} {
            error "Cannot convert string '$n' to subcript. valid chars: [dict keys $num_subscript]"
        }
        return [string map $num_subscript $n]
    }
}

# -----------------------------------------------------------------------------
#    register namespace(s) to have PUNKARGS,PUNKARGS_aliases variables checked
# -----------------------------------------------------------------------------
#    variable PUNKARGS
#    variable PUNKARGS_aliases
namespace eval ::punk::args::register {
    #use fully qualified so 8.6 doesn't find existing var in global namespace
    lappend ::punk::args::register::NAMESPACES ::punk::char
}


# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
##  Ready
package provide punk::char [tcl::namespace::eval punk::char {
    variable version
    set version 0.1.0
}]
return

#*** !doctools
#[manpage_end]