You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

3411 lines
162 KiB

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from <pkg>-buildversion.txt
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2023
#
# @@ Meta Begin
# Application punk::char 0.1.0
# Meta platform tcl
# Meta license <unspecified>
# @@ Meta End
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::char 0 0.1.0]
#[copyright "2024"]
#[titledesc {character-set and unicode utilities}] [comment {-- Name section and table of contents description --}]
#[moddesc {character-set nad unicode}] [comment {-- Description at end of page heading --}]
#[require punk::char]
#[keywords module encodings]
#[description]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Overview]
#[para] overview of punk::char
#[subsection Concepts]
#[para] -
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
##e.g package require frobz
#*** !doctools
#[subsection dependencies]
#[para] packages used by punk::char
#[list_begin itemized]
#[item] [package {Tcl 8.6}]
#
#*** !doctools
#[item] [package {overtype}]
#[para] -
#[item] [package {textblock}]
#[para] -
#[item] [package console]
#[para] -
package require Tcl 8.6-
#dependency on tcllib not ideal for bootstrapping as punk::char is core to many features.. (textutil::wcswidth is therefore included in bootsupport/include_modules.config) review
package require textutil
package require textutil::wcswidth
#*** !doctools
#[list_end]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section API]
#Note that ansi escapes can begin with \033\[ (\u001b\[) or the single character "Control Sequence Introducer" 0x9b
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
tcl::namespace::eval punk::char {
tcl::namespace::export *
variable grapheme_widths [tcl::dict::create]
# -- --------------------------------------------------------------------------
variable encmimens ;#namespace of mime package providing reversemapencoding and mapencoding functions
#tcllib mime requires tcl::chan::memchan,events,core and/or Trf
if {![catch {package require punk::encmime} errM]} {
set encmimens ::punk::encmime
} else {
package require mime
set encmimens ::mime
}
# -- --------------------------------------------------------------------------
variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously
variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common?
#more useful for referring to ANSI documentation would be a proper 7-bit and 8-bit 'Code Table' layout
#as described in ECMA-35 5.2
# where the positions of the table are in one-to-one correspondence with the bit combinations of the code.
#- for 7-bit: 8 columns 16 rows
#- for 8-bit 16 columns 16 rows
proc codetable {which} {
set bits 8
switch -- $which {
ascii8 {
set which default
}
ascii {
set bits 7
}
default {
if {$which ni [encoding names]} {
error "codetable unsupported - use 'ascii' or an entry from the result of the 'encoding names' command."
}
}
}
package require punk::ansi
set hibit_count [expr {$bits-4}]
set bitcolumns [expr {2**$hibit_count}] ;#always 4 bits for the rows - remaining bits for the columns
set columncount [expr {$bitcolumns + 6}]
#set header1 [list "" "b7\nb6\nb5" "0\n0\n0" "0\n0\n1" "0\n1\n0" "0\n1\n1" "1\n0\n0" "1\n0\n1" "1\n1\n0" "1\n1\n1"]
set header1 [list]
set hibits_label ""
set indent ""
for {set hb $bits} {$hb > 4} {incr hb -1} {
append hibits_label ${indent}b$hb\n
append indent " "
}
set hibits_label [string range $hibits_label 0 end-1]
lappend header1 $hibits_label "" "" "" "" ""
for {set colidx 0} {$colidx < $bitcolumns} {incr colidx} {
set binval [format %0${hibit_count}b $colidx]
set binvalbits [split $binval ""]
set indent ""
set display_hibits ""
foreach bb $binvalbits {
append display_hibits $indent$bb\n
append indent " "
}
set display_hibits [string range $display_hibits 0 end-1]
lappend header1 $display_hibits
}
#\u2193 down arrow
#right-down arrows
#\u2ba7
#\u21b4
#\u2b0e
set header2 [list " Bits" b4 b3 b2 b1 "column \u2192\nrow \u2b0e" {*}[punk::lib::range 0 $bitcolumns-1]]
set headers [list $header1 $header2]
#set t [textblock::table -return tableobject -rows $rows]
set t [textblock::table -return tableobject]
#todo - fix textblock::table to allow configure -columncount
for {set c 0} {$c < $columncount} {incr c} {
$t add_column
}
set vheaders [punk::transpose_equal_lists $headers]
set hidx -1
foreach vh $vheaders {
incr hidx
$t configure_column $hidx -headers $vh
}
$t configure_header 0 -colspans [list 6 0 0 0 0 0 {*}[lrepeat $bitcolumns 1]]
$t configure_column 0 -blockalign left
#always 16 rows - remaining bits form the columns
for {set ridx 0} {$ridx <= 15} {incr ridx} {
set charlist [list]
set lowbits [format %04b $ridx]
for {set i 0} {$i < $bitcolumns} {incr i} {
set hibits [format %0${hibit_count}b $i]
set ch [format %c [scan ${hibits}${lowbits} %b]]
#puts "-->${hibits}${lowbits} ch:$ch"
if {$which ne "default"} {
if {[catch {encoding convertfrom $which $ch} ch]} {
set ch [punk::ansi::a red bold]-[punk::ansi::a]
lappend charlist $ch
} else {
lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch]
}
} else {
lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch]
}
}
set r [list "" {*}[split $lowbits ""] $ridx {*}$charlist]
$t add_row $r
}
set result [$t print]
$t destroy
return $result
}
#just the 7-bit ascii. use [page ascii] for the 8-bit layout
proc ascii {} {return {
00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL
08 BS 09 HT 0a LF 0b VT 0c FF 0d CR 0e SO 0f SI
10 DLE 11 DC1 12 DC2 13 DC3 14 DC4 15 NAK 16 SYN 17 ETB
18 CAN 19 EM 1a SUB 1b ESC 1c FS 1d GS 1e RS 1f US
20 SP 21 ! 22 " 23 # 24 $ 25 % 26 & 27 '
28 ( 29 ) 2a * 2b + 2c , 2d - 2e . 2f /
30 0 31 1 32 2 33 3 34 4 35 5 36 6 37 7
38 8 39 9 3a : 3b ; 3c < 3d = 3e > 3f ?
40 @ 41 A 42 B 43 C 44 D 45 E 46 F 47 G
48 H 49 I 4a J 4b K 4c L 4d M 4e N 4f O
50 P 51 Q 52 R 53 S 54 T 55 U 56 V 57 W
58 X 59 Y 5a Z 5b [ 5c \ 5d ] 5e ^ 5f _
60 ` 61 a 62 b 63 c 64 d 65 e 66 f 67 g
68 h 69 i 6a j 6b k 6c l 6d m 6e n 6f o
70 p 71 q 72 r 73 s 74 t 75 u 76 v 77 w
78 x 79 y 7a z 7b { 7c | 7d } 7e ~ 7f DEL
}}
#G0 character set
proc ascii2 {} {
set dict [asciidict2]
set out ""
set i 1
append out " "
tcl::dict::for {k v} $dict {
#single chars are wrapped with \033(0 and \033(B ie total length 7
if {[tcl::string::length $v] == 7} {
set v " $v "
} elseif {[tcl::string::length $v] == 2} {
set v "$v "
} elseif {[tcl::string::length $v] == 0} {
set v " "
}
append out "$k $v "
if {$i > 0 && $i % 8 == 0} {
set out [tcl::string::range $out 0 end-2]
append out \n " "
}
incr i
}
set out [tcl::string::trimright $out " "]
return $out
}
namespace eval argdoc {
variable PUNKARGS
lappend PUNKARGS [list {
@id -id ::punk::char::ascii2doublewide
@cmd -name punk::char::ascii2doublewide\
-summary\
"string to double-wide unicode char"\
-help\
"Convert parts of string in the ascii range 21 to 7E to corresponding chars from the unicode
'Halfwidth and Fullwidth Forms' block.
The space character (0x20) is converted to the 'IDEOGRAPHIC SPACE' character (0x3000)
Control chars and chars outside the ascii range are not converted and passed through as-is."
@values -min 1 -max 1
str -type string
}]
}
proc ascii2doublewide {str} {
#set base to \UFEE0 and add ascii value of char to get the double-wide character in the 'Halfwidth and Fullwidth Forms' block of unicode
set out ""
foreach ch [split $str ""] {
set decval [scan $ch %c]
if {$decval == 0x20} {
#space char maps to ideographic space rather than the fullwidth space char - as the fullwidth space char is often rendered as a narrow space rather than a wide one - and the ideographic space is more likely to be rendered as a wide space.
set decval 0x3000
} elseif {$decval < 0x21 || $decval > 0x7E} {
#chars outside the ascii range are not converted - pass through as-is
set decval [scan $ch %c]
} else {
set decval [expr {0xFEE0 + $decval}]
}
append out [format %c $decval]
}
return $out
}
proc doublewide2ascii {str} {
set out ""
foreach ch [split $str ""] {
set decval [scan $ch %c]
if {$decval == 0x3000} {
#ideographic space maps to ascii space char
set decval 0x20
append out \U3000
} elseif {$decval < 0xFF01 || $decval > 0xFF5E} {
#chars outside the part of the 'Halfwidth and Fullwidth Forms' block that corresponds to ascii are not converted - pass through as-is
append out $ch
} else {
set decval [expr {$decval - 0xFEE0}]
append out [format %c $decval]
}
}
return $out
}
proc ascii2NTFSPUA {str} {
#set base to \uF000 and add ascii value of char to get the PUA character in the 'Private Use Area' block of unicode
set out ""
foreach ch [split $str ""] {
set decval [scan $ch %c]
if {$decval <= 0x7F} {
append out [format %c [expr {0xF000 + $decval}]]
} else {
#chars outside the ascii range are not converted - pass through as-is
append out $ch
}
}
return $out
}
proc NTFSPUA2ascii {str} {
set out ""
foreach ch [split $str ""] {
set decval [scan $ch %c]
if {$decval >= 0xF000 && $decval <= 0xF0FF} {
set decval [expr {$decval - 0xF000}]
append out [format %c $decval]
} else {
#chars outside the 'Private Use Area' block that corresponds to ascii are not converted - pass through as-is
append out $ch
}
}
return $out
}
proc symbol {} {
tailcall page symbol
}
proc dingbats {} {
set out ""
append out [page dingbats] \n
set unicode_dict [charset_dictget Dingbats]
append out " "
set i 1
tcl::dict::for {k charinfo} $unicode_dict {
set char [tcl::dict::get $charinfo char]
if {[tcl::string::length $char] == 0} {
set displayv " "
} elseif {[tcl::string::length $char] == 1} {
set displayv " $char "
} else {
set displayv $char
}
append out "$k $displayv "
if {$i > 0 && $i % 8 == 0} {
set out [tcl::string::range $out 0 end-2]
append out \n " "
}
incr i
}
return $out
}
proc page_names {{search *}} {
set all_names [list]
set d [page_names_dict $search]
tcl::dict::for {k v} $d {
if {$k ni $all_names} {
lappend all_names $k
}
foreach m $v {
if {$m ni $all_names} {
lappend all_names $m
}
}
}
return [lsort $all_names]
}
proc page_names_help {{namesearch *}} {
set d [page_names_dict $namesearch]
set out ""
tcl::dict::for {k v} $d {
append out "$k $v" \n
}
return [linesort $out]
}
proc page_names_dict {{search *}} {
if {![regexp {[?*]} $search]} {
set search "*$search*"
}
set encnames [encoding names]
foreach enc $encnames {
tcl::dict::set d $enc [list]
}
variable encmimens
set mimenames [array get ${encmimens}::reversemap]
tcl::dict::for {mname encname} $mimenames {
if {$encname in $encnames} {
set enclist [tcl::dict::get $d $encname]
if {$mname ni $enclist} {
tcl::dict::lappend d $encname $mname
}
}
}
foreach enc [lsort $encnames] {
set mime_enc [${encmimens}::mapencoding $enc]
if {$mime_enc ne ""} {
set enclist [tcl::dict::get $d $enc]
if {$mime_enc ni $enclist} {
tcl::dict::lappend d $enc $mime_enc
}
}
}
set dresult [tcl::dict::create]
if {$search ne "*"} {
tcl::dict::for {k v} $d {
if {[tcl::string::match -nocase $search $k] || ([lsearch -nocase $v $search]) >= 0} {
tcl::dict::set dresult $k $v
}
}
} else {
set dresult $d
}
return $dresult
}
proc page8 {encname args} {
tcl::dict::set args -cols 8
tailcall page $encname {*}$args
}
proc page16 {encname args} {
tcl::dict::set args -cols 16
tailcall page $encname {*}$args
}
#This will not display for example, c0 glyphs for cp437
# we could use the punk::ansi::cp437_map dict - but while that might be what some expect to see - it would probably be too much magic for this function - which is intended to align more with what Tcl's encoding convertfrom/to actually does.
# for nonprinting members of the page, 2 and 3 letter codes are used rather than unicode visualisation replacements or even unicode equivalent replacements known to be appropriate for the page
proc page {encname args} {
variable invalid
set encname [encname $encname]
set defaults [list {*}{
-range {0 256}
-cols 16
}]
set opts [tcl::dict::merge $defaults $args]
# -- --- --- --- --- --- --- --- ---
set cols [tcl::dict::get $opts -cols]
# -- --- --- --- --- --- --- --- ---
set d_bytedisplay [basedict_display]
#set d_ascii [pagedict_raw ascii]
set d_ascii [basedict]
set d_asciiposn [lreverse $d_ascii] ;#values should be unique except for "???". We are mainly interested in which ones have display-names e.g cr csi
#The results of this are best seen by comparing the ebcdic and ascii pages
set d_page [pagedict_raw $encname]
set out ""
set i 1
append out " "
tcl::dict::for {k rawchar} $d_page {
set num [expr {"0x$k"}]
#see if ascii equivalent exists and has a name
if {$rawchar eq $invalid} {
set displayv "$invalid"
} else {
set bytedisplay ""
if {[tcl::dict::exists $d_asciiposn $rawchar]} {
set asciiposn [tcl::dict::get $d_asciiposn $rawchar]
set bytedisplay [tcl::dict::get $d_bytedisplay $asciiposn]
}
if {$bytedisplay eq $invalid} {
#
set displayv " $rawchar "
} else {
set displaylen [tcl::string::length $bytedisplay]
if {$displaylen == 2} {
set displayv "$bytedisplay "
} elseif {$displaylen == 3} {
set displayv $bytedisplay
} else {
if {[tcl::string::length $rawchar] == 0} {
set displayv " "
} else {
#presumed 1
set displayv " $rawchar "
}
}
}
}
append out "$k $displayv "
if {$i > 0 && $i % $cols == 0} {
set out [tcl::string::range $out 0 end-2]
append out \n " "
}
incr i
}
set out [tcl::string::trimright $out " "]
return $out
}
proc pagechar1 {page num} {
set encpage [encname $page]
encoding convertfrom $encpage [format %c $num]
}
proc pagechar {page num} {
set encpage [encname $page]
set ch [format %c $num]
if {[decodable $ch $encpage]} {
set outchar [encoding convertfrom $encpage $ch]
} else {
#here we will use \0xFFFD instead of our replacement string ??? - as the name pagechar implies always returning a single character. REVIEW.
set outchar $::punk::char::invalid_display_char
}
return $outchar
}
proc pagechar_info {page num} {
set ch [format %c $num]
set h [format %04x $num]
set encpage [encname $page]
if {[decodable $ch $encpage]} {
set outchar [encoding convertfrom $encpage $ch]
} else {
error "pagechar_info: $h not decodable from $encpage"
}
package require punk::console
puts -nonewline stdout \033\[s;flush stdout
lassign [punk::console::get_cursor_pos_list] _row1 col1
puts -nonewline stdout "$outchar";flush stdout
lassign [punk::console::get_cursor_pos_list] _row2 col2
puts -nonewline stdout "\033\[u";flush stdout
return "$col1 -> $col2"
}
proc pagebyte {page num} {
set encpage [encname $page]
set ch [format %c $num]
if {[decodable $ch $encpage]} {
#set outchar [encoding convertto $encpage [format %c $num]]
set outchar [format %c $num]
} else {
set outchar $::punk::char::invalid_display_char
}
return $outchar
}
proc all_pages {} {
set out ""
set mimenamesdict [page_names_dict]
foreach encname [encoding names] {
if {[tcl::dict::exists $mimenamesdict $encname]} {
set alt "([tcl::dict::get $mimenamesdict $encname])"
} else {
set alt ""
}
append out "$encname $alt" \n
append out [page $encname]
}
return $out
}
proc encname {encoding_name_or_alias} {
set encname $encoding_name_or_alias
if {[lsearch -nocase [page_names] $encname] < 0} {
error "Unknown encoding '$encname' - use 'punk::char::page_names' to see valid encoding names on this system"
}
variable encmimens
if {$encname ni [encoding names]} {
set encname [${encmimens}::reversemapencoding $encname]
}
return $encname
}
proc pagedict_raw {encname} {
variable invalid ;# ="???"
set encname [encname $encname]
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
#tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]]
set ch [format %c $i] ;
#jmn
if {[decodable $ch $encname]} {
#set encchar [encoding convertto $encname $ch]
#tcl::dict::set d $k [encoding convertfrom $encchar]
tcl::dict::set d $k [encoding convertfrom $encname $ch]
} else {
tcl::dict::set d $k $invalid ;#use replacement so we can detect difference from actual "?"
}
}
return $d
}
proc asciidict {} {
variable invalid
set d [tcl::dict::create]
set a128 [asciidict128]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
if {$i <= 127} {
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
#
tcl::dict::set d $k $invalid
}
if {$i <=32} {
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
if {$i == 0x9b} {
tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
} else {
tcl::dict::set d $k [format %c $i]
}
}
}
return $d
}
proc basedict_display {} {
set d [tcl::dict::create]
set a128 [asciidict128]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
if {$i <=32} {
#no point emitting the lower control characters raw to screen - emit the short-names defined in the 'ascii' proc
tcl::dict::set d $k [tcl::dict::get $a128 $k]
} else {
if {$i == 0x9b} {
tcl::dict::set d $k CSI ;#don't emit the ansi 'Control Sequence Introducer' - or it will be interpreted by the console and affect the layout.
} elseif {$i == 0x9c} {
tcl::dict::set d $k OSC
} else {
#tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]]
#don't use encoding convertfrom - we want the value independent of the current encoding system.
tcl::dict::set d $k [format %c $i]
}
}
}
return $d
}
proc basedict_encoding_system {} {
#result depends on 'encoding system' currently in effect
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
tcl::dict::set d $k [encoding convertfrom [encoding system] [format %c $i]]
}
return $d
}
proc basedict {} {
#this gives same result independent of current value of 'encoding system'
set d [tcl::dict::create]
for {set i 0} {$i < 256} {incr i} {
set k [format %02x $i]
tcl::dict::set d $k [format %c $i]
}
return $d
}
proc pagedict {pagename args} {
variable charsets
set encname [encname $pagename]
set defaults [list {*}{
-range {0 255}
-charset ""
}]
set opts [tcl::dict::merge $defaults $args]
# -- --- --- --- --- --- --- --- --- ---
set range [tcl::dict::get $opts -range]
set charset [tcl::dict::get $opts -charset]
# -- --- --- --- --- --- --- --- --- ---
if {$charset ne ""} {
if {$charset ni [charset_names]} {
error "unknown charset '$charset' - use 'charset_names' to get list"
}
set setinfo [tcl::dict::get $charsets $charset]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
#set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]]
break
}
} else {
set start [lindex $range 0]
set end [lindex $range 1]
}
set d [tcl::dict::create]
for {set i $start} {$i <= $end} {incr i} {
set k [format %02x $i]
tcl::dict::set d $k [encoding convertfrom $encname [format %c $i]]
}
return $d
}
#todo - benchmark peformance - improve punk pipeline
proc asciidict128 {} {
regexp -all -inline {\S+} [concat {*}[linelist -line trimleft [ascii]]]
}
proc _asciidict128 {} {
.= ascii |> .=> linelist -line trimleft |> .=* concat |> {regexp -all -inline {\S+} $data}
}
#review - use terminal to display actual supported DEC specials vs using dict at: punk::ansi::map_special_graphics which maps to known unicode equivalents
proc asciidict2 {} {
set d [tcl::dict::create]
tcl::dict::for {k v} [basedict_display] {
if {[tcl::string::length $v] == 1} {
set num [expr {"0x$k"}]
#tcl::dict::set d $k "\033(0[subst \\u00$k]\033(B"
tcl::dict::set d $k "\033(0[format %c $num]\033(B"
} else {
tcl::dict::set d $k $v
}
}
return $d
}
#-- --- --- --- --- --- --- ---
# encoding convertfrom & encoding convertto can be somewhat confusing to think about. (Need to think in reverse.)
# e.g encoding convertto dingbats <somethingpretty> will output something that doesn't look dingbatty on screen.
#-- --- --- --- --- --- --- ---
#must use Tcl instead of tcl (at least for 8.6)
if {![package vsatisfies [package present Tcl] 8.7-]} {
proc encodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {($encname eq "ascii")} {
#8.6 fails to round-trip convert 0x7f del character despite it being in the ascii range (review Why?? what else doesn't round-trip but should?)
#just strip it out of the string as we are only after a boolean answer and if s is only a single del char empty string will result in true
set s [tcl::string::map [list [format %c 0x7f] ""] $s]
}
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
}
#note also that tcl8.6 has anomalies with how it handles some unassigned codepoints
# e.g unassigned codes in the middle of a codepage may appear to be encodable&decodable in a round trip whereas undefined codepoints at the end may get the replacement character defined in the tcl encodings dir (usually the 3f char: "?")
proc decodable "s {enc [encoding system]}" {
set encname [encname $enc]
#review
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
}
} else {
#review - use -profile?
proc encodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {![catch {
string eq $s [encoding convertfrom $encname [encoding convertto $encname $s]]
} result]} {
return $result
} else {
return 0
}
}
proc decodable "s {enc [encoding system]}" {
set encname [encname $enc]
if {![catch {
string eq $s [encoding convertto $encname [encoding convertfrom $encname $s]]
} result]} {
return $result
} else {
return 0
}
}
}
#-- --- --- --- --- --- --- ---
proc test_japanese {{encoding jis0208}} {
#A very basic test of 2char encodings such as jis0208
set yatbun ;# encoding convertfrom jis0208 F|K\\
lassign [split $yatbun] yat bun
puts "original yatbun ${yat} ${bun}"
set eyat [encoding convertto $encoding $yat]
set ebun [encoding convertto $encoding $bun]
puts "$encoding encoded: ${eyat} ${ebun}"
puts "reencoded: [encoding convertfrom $encoding $eyat] [encoding convertfrom $encoding $ebun]"
return $yatbun
}
proc test_grave {} {
set g [format %c 0x300]
puts stdout "Testing console display of grave accented a in between letters x and y - accent should combine over the top of the letter a."
puts stdout "Apparent width should theoretically be 1 console-column"
package require punk::console
puts stdout "# -- --- --- ---"
puts -nonewline "xa${g}z";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing 4 bytes (expecting 3 glyphs): $cursorposn"
puts stdout "# -- --- --- ---"
puts -nonewline "xyz";set cursorposn [punk::console::get_cursor_pos]
puts stdout \n
puts stdout "cursor position immediately after outputing 3 bytes (xyz): $cursorposn"
}
proc test_zalgo {} {
#from: https://github.com/jameslanska/unicode-display-width/blob/5e28d94c75e8c421a87199363b85c90dc37125b8/docs/unicode_background.md
#see: https://lingojam.com/ZalgoText
puts stdout "44 chars long - 9 graphemes - 9 columns wide"
set str "̌á̲l͔̝̞̄̑͌g̖̘̘̔̔͢͞͝o̪̔T̢̙̫̈̍͞e̬͈͕͌̏͑x̺̍̓̓ͅ"
}
proc test_zalgo2 {} {
# ------------------------
set str "Z̸̢͉̣͔̭̪͙̖̳̘͈͍̤̩̟͈͈͕̯̅̏̆̓̌́́͌̿̕ͅą̷̦̤̫̩͕̥̐̎̓́́̂̀͆́̔̄̈́̏̌́̆͜͜͝͠l̴̩̙̺͚̟͇͖͔͕̹̟͌̈́̄̇́̉̋̕̚͜͠͠g̸͇̩͔̺̝̗̥̖̙̑̈́̈́̿̾̌͌͊̈̀͊̑̇̑͌̍̅̌͊͜͠͝ǫ̴̢̧͎͔̞̤̻̱̮̬͕̗̭͍̣̠̳̆̀̋̓͒̾̄͜͝͠͝T̴̛͉̬̋́̈́̈́̓͗̍̏̔̈̋̀͐̓̎̅̚̕͠͝͝ê̵̖̖̈͊̔̓̔̐̓̃͊̋͋̎͂͋̕x̴̡͎͖̼͎̦͎̲̪̘̺̯̲̬̮̥̼̰͌̃͜ͅt̶̛̘͎̰̔͐͌́̈́͊͗͌̅͂̐̆͌̀͂͘"
# ------------------------
}
proc test_zalgo3 {} {
# ------------------------
set str "Ẕ̸̢̼̺̜̰̣̣̖̭̜͓͖̺̥̼̠͙͙̟̥̟̠̤̫͍̠̤̮͚̝̜̙͈̦̙̩̹̙̜̩̦͔͕̈̃̅̇̈́͂̇̽̈́́́́̎͊̂̑̋͆̔̾͋̚͜ͅã̵̛̪̽̎̃͑̒̇͋͂̽̃̍͊̀̈̊̊̔̊̆̈́͗͑͗̽̄̋͗̄͌̑͊͝͝͠ͅl̵̢͇̖͉͖̝̹̜̞͓͎͍͈̞̱̙͙̦̪͔̱̮͈͉̼͎̭̝̯͇͚̺̟̱͙̳̰̙͚͖̝̫͙̎̅̃͆̈́̋̌̔̋̋͗̈́̔̐͆̈́̓̾̄̀́̏́͒̆̌͒̈́̈́̾̏̀͜͝g̸̖͂͋̊͗̈́̓̆̈̋̒͐̕o̶̧̢͓̲̗̠̘͕̦̤̹̗͉̰͔̯͓̭̹̻͔͇̯̜̙̍̂̃̃̀̓͌̒͊̊̋̒̿̿̌͐̄͗̾̕͝T̶̛̳̜̰͍̹̻̫̠̱̼̼̙̆̑̾̾͛́́̿͋͌̎̀̀̽̆͌̽̂̈̇̅̇̃́̿͗̾͒̎͊̑̾͝͠ȩ̸̢̨̛̛̛̭͔͎͇̫͎͈̲̻̙͕͎̣͈̩̺̗͖͙͇͌͐̒̎͐̓́̉́͐̓̐͌̅̊͋͑̈́͗͑̏̕͜͜͝ͅx̸̧̧̛̖̣̥̘͎͎̳̭̙̦̝͖̝̮̱̹̯̺̙̩̯̮͖̻̰͓̰̩͇̥̑͌̊̐̉̏͆̓̑̎̊̓͒̂̄̆͆̀̊̄̈̽͐͛̏͊̓̌͑́̎̈́̍̈́̊͗̉̋͆̿̊͘͜͜͝͝ͅͅͅt̵̡̨̜̘͎̦͚̠̗̺͉̼̠̲̠͙̺̹̗̲̏̈́̂̚͜͜͝ͅ"
# ------------------------
}
proc test_farmer {} {
#an interesting article re grapheme clustering problems in terminals https://mitchellh.com/writing/grapheme-clusters-in-terminals
#(similar to the problem with grave accent rendering width that the test_grave proc is written for)
# -- --- --- --- ---
#These pasted glyphs can display in console even when the unicode versions don't (tcl 8.6 limited to 65533/FFFD ?)
upvar farmer1_paste test_farmer1
upvar farmer2_paste test_farmer2
set test_farmer1 🧑🌾 ;#contains zero-width joiner between
set test_farmer2 🧑🌾
puts "pasted farmer1 exporting as var farmer1_paste: $test_farmer1"
puts "pasted farmer2 exporting as var farmer2_paste: $test_farmer2"
# -- --- --- --- ---
set farmer1 "\U0001f9d1\U0000200d\U0001f33e"
set farmer2 "\U0001f9d1\U0001f33e"
puts stdout "farmer1 with zero-width joiner, codes: \\U0001f9d1\\U0000200d\\U0001f33e : $farmer1"
puts stdout "farmer2 with no joiner codes: \\U0001f9d1\\U001f33e : $farmer2"
package require punk::console
puts stdout \n
puts stdout "#2--5---9---C---"
puts -nonewline stdout \033\[2K\033\[1G ;#2K erase line 1G cursor at col1
puts -nonewline "${farmer1}";set cursorposn [punk::console::get_cursor_pos_list]
puts stdout "\ncursor position immediately after outputing farmer1 (expecting 1 glyph 2 wide) : cursor at col [lindex $cursorposn 1]"
if {[lindex $cursorposn 1] eq "3"} {
puts stdout "[a+ green]OK[a]"
} else {
puts stdout "[a+ red]ERR - expected cursor position to be 3 after emitting farmer1[a]"
}
puts stdout "----------------"
puts stdout "#2--5---9---C---"
puts -nonewline "${farmer2}";set cursorposn [punk::console::get_cursor_pos_list]
puts stdout "\ncursor position immediately after outputing farmer2 (expecting 2 glyphs 4 wide in total): cursor at col [lindex $cursorposn 1]"
if {[lindex $cursorposn 1] eq "5"} {
puts stdout "[a+ green]OK[a]"
} else {
puts stdout "[a+ red]ERR - expected cursor position to be 5 after emitting farmer2[a]"
}
puts stdout "----------------"
puts "returning farmer1 - should be single glyph"
return $farmer1
}
proc _show_graphemes {str} {
set tk_graphemes [punk::char::grapheme_split_tk $str]
puts "graphemes from tk:"
set i 0
foreach g $tk_graphemes {
puts " $i: '$g'"
incr i
}
set punk_graphemes [punk::char::grapheme_split $str]
puts "graphemes from punk::char::grapheme_split:"
set i 0
foreach g $punk_graphemes {
puts " $i: '$g'"
incr i
}
}
proc test_malayalam1 {} {
set str "മലയ"
puts "Malayalam string: $str"
_show_graphemes $str
puts "$str"
puts [punk::ansi::mark_columns -number below -marker {| .} {1 5 10} {2 3 4 6 7 8 9}]
return $str
}
proc test_malayalam2 {} {
set str "സന"
puts "Malayalam string: $str"
_show_graphemes $str
puts "$str"
puts [punk::ansi::mark_columns -number below -marker {| .} {1 5 10} {2 3 4 6 7 8 9}]
return $str
}
#G0 Sets Sequence G1 Sets Sequence Meaning
#ESC ( A ESC ) A United Kingdom Set
#ESC ( B ESC ) B ASCII Set
#ESC ( 0 ESC ) 0 Special Graphics
#ESC ( 1 ESC ) 1 Alternate Character ROM Standard Character Set
#ESC ( 2 ESC ) 2 Alternate Character ROM Special Graphic
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode character sets - some hardcoded - some loadable from data files
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
variable charinfo [tcl::dict::create]
variable charsets [tcl::dict::create]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Aggregate character sets (ones that pick various ranges from underlying unicode ranges)
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "WGL4" [list altname "Windows Glyph List 4" ranges [list {*}{
{start 0 end 127 name "basic latin"}
{start 128 end 255 name "latin-1 supplement"}
{start 256 end 383 name "Latin Extended-A"}
{start 402 end 402 name "subset Latin Extended-B"}
{start 506 end 511 name "subset Latin Extended-B"}
{start 710 end 711 name "subset Spacing Modifier Letters"}
{start 713 end 713 name "subset Spacing Modifier Letters"}
{start 728 end 733 name "subset Spacing Modifier Letters"}
{start 900 end 906 name "subset Greek"}
{start 908 end 908 name "subset Greek"}
{start 910 end 974 name "subset Greek"}
{start 1024 end 1119 name "subset Cyrillic"}
{start 1168 end 1169 name "subset Cyrillic"}
{start 7808 end 7813 name "subset Latin Extended Additional"}
{start 7922 end 7923 name "subset Latin Extended Additional"}
{start 8211 end 8213 name "subset General Punctuation"}
{start 8215 end 8222 name "subset General Punctuation"}
{start 8224 end 8226 name "subset General Punctuation"}
{start 8230 end 8230 name "subset General Punctuation"}
{start 8240 end 8240 name "subset General Punctuation"}
{start 8242 end 8243 name "subset General Punctuation"}
}] description "Microsoft WGL4 Repertoire" settype "other"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#The base page 0-256 8bit range - values don't have specific characters or descriptions - as they are codepage dependent
#we will fill this here for completeness - but with placeholders
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "8bit" [list ranges [list {start 0 end 127 name ascii} {start 128 end 255 name 8bit-ascii}] description "8bit extended ascii range" settype "other"]
for {set i 0} {$i < 256} {incr i} {
tcl::dict::set charinfo $i [list desc "codepage-dependent" short "byte_[format %02x $i]"]
}
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# Unicode ranges
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "greek" [list ranges [list {start 880 end 1023} {start 7936 end 8191}] description "Greek and Coptic" settype "other"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "Block Elements" [list ranges [list {start 9600 end 9631}] description "Block Elements" settype "other"]
tcl::dict::set charinfo 9600 [list desc "Upper Half Block" short "blocke_up_half"]
tcl::dict::set charinfo 9601 [list desc "Lower One Eighth Block" short "blocke_lw_1_8th"]
tcl::dict::set charinfo 9602 [list desc "Lower One Quarter Block" short "blocke_lw_1_qtr"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "Dingbats" [list ranges [list {start 9984 end 10175 }] description "Unicode Dingbats" settype "tcl_supplemented"]
tcl::dict::set charinfo 9984 [list desc "Black Safety Scissors" short "dingbats_black_safety_scissors"]
#...
tcl::dict::set charinfo 10175 [list desc "Double Curly Loop" short "dingbats_double_curly_loop"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#variation selectors 0xFe01 - 0xFE0F
tcl::dict::set charsets "Variation Selectors" [list ranges [list {start 65024 end 65039}] description "Variation Selectors" note "combining character with previous char - variant glyph display" settype "tcl_supplemented"]
tcl::dict::set charinfo 65024 [list desc "Variation Selector-1" short "VS1"]
tcl::dict::set charinfo 65025 [list desc "Variation Selector-2" short "VS2"]
tcl::dict::set charinfo 65026 [list desc "Variation Selector-3" short "VS3"]
tcl::dict::set charinfo 65027 [list desc "Variation Selector-4" short "VS4"]
tcl::dict::set charinfo 65027 [list desc "Variation Selector-5" short "VS5"]
tcl::dict::set charinfo 65029 [list desc "Variation Selector-6" short "VS6"]
tcl::dict::set charinfo 65030 [list desc "Variation Selector-7" short "VS7"]
tcl::dict::set charinfo 65031 [list desc "Variation Selector-8" short "VS8"]
tcl::dict::set charinfo 65032 [list desc "Variation Selector-9" short "VS9"]
tcl::dict::set charinfo 65033 [list desc "Variation Selector-10" short "VS10"]
tcl::dict::set charinfo 65034 [list desc "Variation Selector-11" short "VS11"]
tcl::dict::set charinfo 65035 [list desc "Variation Selector-12" short "VS12"]
tcl::dict::set charinfo 65036 [list desc "Variation Selector-13" short "VS13"]
tcl::dict::set charinfo 65037 [list desc "Variation Selector-14" short "VS14"]
tcl::dict::set charinfo 65038 [list desc "Variation Selector-15 text variation" short "VS15"] ;#still an image - just more suitable for text-presentation e.g word-processing doc
tcl::dict::set charinfo 65039 [list desc "Variation Selector-16 emoji variation" short "VS16"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
# emoticons https://www.unicode.org/charts/PDF/U1F600.pdf
tcl::dict::set charsets "Emoticons" [list ranges [list {start 128512 end 128591}] description "Emoticons" settype "tcl_supplemented"]
tcl::dict::set charinfo 128512 [list desc "Grinning Face" short "emoticon_gface"]
tcl::dict::set charinfo 128513 [list desc "Grinning Face with Smiling Eyes" short "emoticon_gface_smile_eyes"]
tcl::dict::set charinfo 128514 [list desc "Face with Tears of Joy" short "emoticon_face_tears_joy"]
#todo
tcl::dict::set charinfo 128590 [list desc "Person with Pouting Face" short "emoticon_person_pout"]
# -- -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
tcl::dict::set charsets "Box Drawing" [list ranges [list {start 9472 end 9599}] description "Box Drawing" settype "tcl_supplemented"]
tcl::dict::set charinfo 9472 [list desc "Box Drawings Light Horizontal" short "boxd_lhz"]
tcl::dict::set charinfo 9473 [list desc "Box Drawings Heavy Horizontal" short "boxd_hhz"]
tcl::dict::set charinfo 9474 [list desc "Box Drawings Light Vertical" short "boxd_lv"]
tcl::dict::set charinfo 9475 [list desc "Box Drawings Heavy Vertical" short "boxd_hv"]
tcl::dict::set charinfo 9476 [list desc "Box Drawings Light Triple Dash Horizontal" short "boxd_ltdshhz"]
tcl::dict::set charinfo 9477 [list desc "Box Drawings Heavy Triple Dash Horizontal" short "boxd_htdshhz"]
tcl::dict::set charinfo 9478 [list desc "Box Drawings Light Triple Dash Vertical" short "boxd_ltdshv"]
tcl::dict::set charinfo 9479 [list desc "Box Drawings Heavy Triple Dash Vertical" short "boxd_htdshv"]
tcl::dict::set charinfo 9480 [list desc "Box Drawings Light Quadruple Dash Horizontal" short "boxd_lqdshhz"]
tcl::dict::set charinfo 9481 [list desc "Box Drawings Heavy Quadruple Dash Horizontal" short "boxd_hqdshhz"]
tcl::dict::set charinfo 9482 [list desc "Box Drawings Light Quadruple Dash Vertical" short "boxd_lqdshv"]
tcl::dict::set charinfo 9483 [list desc "Box Drawings Heavy Quadruple Dash Vertical" short "boxd_hqdshv"]
tcl::dict::set charinfo 9484 [list desc "Box Drawings Light Down and Right" short "boxd_ldr"]
tcl::dict::set charinfo 9485 [list desc "Box Drawings Down Light and Right Heavy" short "boxd_dlrh"]
tcl::dict::set charinfo 9486 [list desc "Box Drawings Down Heavy and Right Light" short "boxd_dhrl"]
tcl::dict::set charinfo 9487 [list desc "Box Drawings Heavy Down and Right" short "boxd_hdr"]
tcl::dict::set charinfo 9488 [list desc "Box Drawings Light Down and Left" short "boxd_ldl"]
tcl::dict::set charinfo 9489 [list desc "Box Drawings Down Light and Left Heavy" short "boxd_dllh"]
tcl::dict::set charinfo 9490 [list desc "Box Drawings Down Heavy and Left Light" short "boxd_dhll"]
tcl::dict::set charinfo 9491 [list desc "Box Drawings Heavy Down and Left" short "boxd_hdl"]
tcl::dict::set charinfo 9492 [list desc "Box Drawings Light Up and Right" short "boxd_lur"]
tcl::dict::set charinfo 9493 [list desc "Box Drawings Up Light and Right Heavy" short "boxd_ulrh"]
tcl::dict::set charinfo 9494 [list desc "Box Drawings Up Heavy and Right Light" short "boxd_uhrl"]
tcl::dict::set charinfo 9495 [list desc "Box Drawings Heavy Up and Right" short "boxd_hur"]
tcl::dict::set charinfo 9496 [list desc "Box Drawings Light Up and Left" short "boxd_lul"]
tcl::dict::set charinfo 9497 [list desc "Box Drawings Up Light and Left Heavy" short "boxd_ullh"]
tcl::dict::set charinfo 9498 [list desc "Box Drawings Up Heavy and Left Light" short "boxd_uhll"]
tcl::dict::set charinfo 9499 [list desc "Box Drawings Heavy Up and Left" short "boxd_hul"]
tcl::dict::set charinfo 9500 [list desc "Box Drawings Light Vertical and Right" short "boxd_lvr"]
tcl::dict::set charinfo 9501 [list desc "Box Drawings Vertical Light and Right Heavy" short "boxd_vlrh"]
tcl::dict::set charinfo 9502 [list desc "Box Drawings Up Heavy and Right Down Light" short "boxd_uhrdl"]
tcl::dict::set charinfo 9503 [list desc "Box Drawings Down Heavy and Right Up Light" short "boxd_dhrul"]
tcl::dict::set charinfo 9504 [list desc "Box Drawings Vertical Heavy and Right Light" short "boxd_vhrl"]
tcl::dict::set charinfo 9505 [list desc "Box Drawings Down Light and Right Up Heavy" short "boxd_dlruh"]
tcl::dict::set charinfo 9506 [list desc "Box Drawings Up Light and Right Down Heavy" short "boxd_ulrdh"]
tcl::dict::set charinfo 9507 [list desc "Box Drawings Heavy Vertical and Right" short "boxd_hvr"]
tcl::dict::set charinfo 9508 [list desc "Box Drawings Light Vertical and Left" short "boxd_lvl"]
tcl::dict::set charinfo 9509 [list desc "Box Drawings Vertical Light and Left Heavy" short "boxd_vllh"]
tcl::dict::set charinfo 9510 [list desc "Box Drawings Up Heavy and Let Down Light" short "boxd_uhldl"]
tcl::dict::set charinfo 9511 [list desc "Box Drawings Down Heavy and Left Up Light" short "boxd_dhlul"]
tcl::dict::set charinfo 9512 [list desc "Box Drawings Vertical Heavy and Left Light" short "boxd_vhll"]
tcl::dict::set charinfo 9513 [list desc "Box Drawings Down Light and left Up Heavy" short "boxd_dlluh"]
tcl::dict::set charinfo 9514 [list desc "Box Drawings Up Light and Left Down Heavy" short "boxd_ulldh"]
tcl::dict::set charinfo 9515 [list desc "Box Drawings Heavy Vertical and Left" short "boxd_hvl"]
tcl::dict::set charinfo 9516 [list desc "Box Drawings Light Down and Horizontal" short "boxd_ldhz"]
tcl::dict::set charinfo 9517 [list desc "Box Drawings Left Heavy and Right Down Light" short "boxd_lhrdl"]
tcl::dict::set charinfo 9518 [list desc "Box Drawings Right Heavy and Left Down Light" short "boxd_rhldl"]
tcl::dict::set charinfo 9519 [list desc "Box Drawings Down Light and Horizontal Heavy" short "boxd_dlhzh"]
tcl::dict::set charinfo 9520 [list desc "Box Drawings Down Heavy and Horizontal Light" short "boxd_dhhzl"]
tcl::dict::set charinfo 9521 [list desc "Box Drawings Right Light and Left Down Heavy" short "boxd_rlldh"]
tcl::dict::set charinfo 9522 [list desc "Box Drawings Left Light and Right Down Heavy" short "boxd_llrdh"]
tcl::dict::set charinfo 9523 [list desc "Box Drawings Heavy Down and Horizontal" short "boxd_hdhz"]
tcl::dict::set charinfo 9524 [list desc "Box Drawings Light Up and Horizontal" short "boxd_luhz"]
tcl::dict::set charinfo 9525 [list desc "Box Drawings Left Heavy and Right Up Light" short "boxd_lhrul"]
tcl::dict::set charinfo 9526 [list desc "Box Drawings Right Heavy and Left Up Light" short "boxd_rhlul"]
tcl::dict::set charinfo 9527 [list desc "Box Drawings Up Light and Horizontal Heavy" short "boxd_ulhzh"]
tcl::dict::set charinfo 9528 [list desc "Box Drawings Up Heavy and Horizontal Light" short "boxd_uhhzl"]
tcl::dict::set charinfo 9529 [list desc "Box Drawings Right Light and Left Up Heavy" short "boxd_rlluh"]
tcl::dict::set charinfo 9530 [list desc "Box Drawings Left Light and Right Up Heavy" short "boxd_llruh"]
tcl::dict::set charinfo 9531 [list desc "Box Drawings Heavy Up and Horizontal" short "boxd_huhz"]
tcl::dict::set charinfo 9532 [list desc "Box Drawings Light Vertical and Horizontal" short "boxd_lvhz"]
tcl::dict::set charinfo 9533 [list desc "Box Drawings Left Heavy and Right Vertical Light" short "boxd_lhrvl"]
tcl::dict::set charinfo 9534 [list desc "Box Drawings Right Heavy and Left Vertical Light" short "boxd_rhlvl"]
tcl::dict::set charinfo 9535 [list desc "Box Drawings Vertical Light and Horizontal Heavy" short "boxd_vlhzh"]
tcl::dict::set charinfo 9536 [list desc "Box Drawings Up Heavy and Down Horizontal Light" short "boxd_uhdhzl"]
tcl::dict::set charinfo 9537 [list desc "Box Drawings Down Heavy and Up Horizontal Light" short "boxd_dhuhzl"]
tcl::dict::set charinfo 9538 [list desc "Box Drawings Vertical Heavy and Horizontal Light" short "boxd_vhhzl"]
tcl::dict::set charinfo 9539 [list desc "Box Drawings Left Up Heavy and Right Down Light" short "boxd_luhrdl"]
tcl::dict::set charinfo 9540 [list desc "Box Drawings Right Up Heavy and Left Down Light" short "boxd_ruhldl"]
tcl::dict::set charinfo 9541 [list desc "Box Drawings Left Down Heavy and Right Up Light" short "boxd_ldhrul"]
tcl::dict::set charinfo 9542 [list desc "Box Drawings Right Down Heavy and Left Up Light" short "boxd_rdhlul"]
tcl::dict::set charinfo 9543 [list desc "Box Drawings Down Light and Up Horizontal Heavy" short "boxd_dluhzh"]
tcl::dict::set charinfo 9544 [list desc "Box Drawings Up Light and Down Horizontal Heavy" short "boxd_dldhzh"]
tcl::dict::set charinfo 9545 [list desc "Box Drawings Right Light and Left Vertical Heavy" short "boxd_rllvh"]
tcl::dict::set charinfo 9546 [list desc "Box Drawings Left Light and Right Vertical Heavy" short "boxd_llrvh"]
tcl::dict::set charinfo 9547 [list desc "Box Drawings Heavy Vertical and Horizontal" short "boxd_hvhz"]
tcl::dict::set charinfo 9548 [list desc "Box Drawings Light Double Dash Horizontal" short "boxd_lddshhz"]
tcl::dict::set charinfo 9549 [list desc "Box Drawings Heavy Double Dash Horizontal" short "boxd_hddshhz"]
tcl::dict::set charinfo 9550 [list desc "Box Drawings Light Double Dash Vertical" short "boxd_lddshv"]
tcl::dict::set charinfo 9551 [list desc "Box Drawings Heavy Double Dash Vertical" short "boxd_hddshv"]
tcl::dict::set charinfo 9552 [list desc "Box Drawings Double Horizontal" short "boxd_dhz"]
tcl::dict::set charinfo 9553 [list desc "Box Drawings Double Vertical" short "boxd_dv"]
tcl::dict::set charinfo 9554 [list desc "Box Drawings Down Single and Right Double" short "boxd_dsrd"]
tcl::dict::set charinfo 9555 [list desc "Box Drawings Down Double and Right Single" short "boxd_ddrs"]
tcl::dict::set charinfo 9556 [list desc "Box Drawings Double Down and Right" short "boxd_ddr"]
tcl::dict::set charinfo 9557 [list desc "Box Drawings Down Single and Left Double" short "boxd_dsld"]
tcl::dict::set charinfo 9558 [list desc "Box Drawings Down Double and Left Single" short "boxd_ddls"]
tcl::dict::set charinfo 9559 [list desc "Box Drawings Double Down and Left" short "boxd_ddl"]
tcl::dict::set charinfo 9560 [list desc "Box Drawings Up Single and Right Double" short "boxd_usrd"]
tcl::dict::set charinfo 9561 [list desc "Box Drawings Up Double and Right Single" short "boxd_udrs"]
tcl::dict::set charinfo 9562 [list desc "Box Drawings Double Up and Right" short "boxd_dur"]
tcl::dict::set charinfo 9563 [list desc "Box Drawings Up Single and Left Double" short "boxd_usld"]
tcl::dict::set charinfo 9564 [list desc "Box Drawings Up Double and Left Single" short "boxd_udls"]
tcl::dict::set charinfo 9565 [list desc "Box Drawings Double Up and Left" short "boxd_dul"]
tcl::dict::set charinfo 9566 [list desc "Box Drawings Vertical Single and Right Double" short "boxd_vsrd"]
tcl::dict::set charinfo 9567 [list desc "Box Drawings Vertical Double and Right Single" short "boxd_vdrs"]
tcl::dict::set charinfo 9568 [list desc "Box Drawings Double Vertical and Right" short "boxd_dvr"]
tcl::dict::set charinfo 9569 [list desc "Box Drawings Vertical Single and Left Double" short "boxd_vsld"]
tcl::dict::set charinfo 9570 [list desc "Box Drawings Vertical Double and Left Single" short "boxd_vdls"]
tcl::dict::set charinfo 9571 [list desc "Box Drawings Double Vertical and Left" short "boxd_dvl"]
tcl::dict::set charinfo 9572 [list desc "Box Drawings Down Single and Horizontal Double" short "boxd_dshzd"]
tcl::dict::set charinfo 9573 [list desc "Box Drawings Down Double and Horizontal Single" short "boxd_ddhzs"]
tcl::dict::set charinfo 9574 [list desc "Box Drawings Double Down and Horizontal" short "boxd_ddhz"]
tcl::dict::set charinfo 9575 [list desc "Box Drawings Up Single and Horizontal Double" short "boxd_ushzd"]
tcl::dict::set charinfo 9576 [list desc "Box Drawings Up Double and Horizontal Single" short "boxd_udhzs"]
tcl::dict::set charinfo 9577 [list desc "Box Drawings Double Up and Horizontal" short "boxd_duhz"]
tcl::dict::set charinfo 9578 [list desc "Box Drawings Vertical Single and Horizontal Double" short "boxd_vshzd"]
tcl::dict::set charinfo 9579 [list desc "Box Drawings Vertical Double and Horizontal Single" short "boxd_vdhzs"]
tcl::dict::set charinfo 9580 [list desc "Box Drawings Double Vertical and Horizontal" short "boxd_dvhz"]
tcl::dict::set charinfo 9581 [list desc "Box Drawings Light Arc Down and Right" short "boxd_ladr"]
tcl::dict::set charinfo 9582 [list desc "Box Drawings Light Arc Down and Left" short "boxd_ladl"]
tcl::dict::set charinfo 9583 [list desc "Box Drawings Light Arc Up and Left" short "boxd_laul"]
tcl::dict::set charinfo 9584 [list desc "Box Drawings Light Arc Up and Right" short "boxd_laur"]
tcl::dict::set charinfo 9585 [list desc "Box Drawings Light Diagonal Upper Right To Lower Left" short "boxd_ldgurll"]
tcl::dict::set charinfo 9586 [list desc "Box Drawings Light Diagonal Upper Left To Lower Right" short "boxd_ldgullr"]
tcl::dict::set charinfo 9587 [list desc "Box Drawings Light Diagonal Cross" short "boxd_ldc"]
tcl::dict::set charinfo 9588 [list desc "Box Drawings Light Left" short "boxd_ll"]
tcl::dict::set charinfo 9589 [list desc "Box Drawings Light Up" short "boxd_lu"]
tcl::dict::set charinfo 9590 [list desc "Box Drawings Light Right" short "boxd_lr"]
tcl::dict::set charinfo 9591 [list desc "Box Drawings Light Down" short "boxd_ld"]
tcl::dict::set charinfo 9592 [list desc "Box Drawings Heavy Left" short "boxd_hl"]
tcl::dict::set charinfo 9593 [list desc "Box Drawings Heavy Up" short "boxd_hu"]
tcl::dict::set charinfo 9594 [list desc "Box Drawings Heavy Right" short "boxd_hr"]
tcl::dict::set charinfo 9595 [list desc "Box Drawings Heavy Down" short "boxd_hd"]
tcl::dict::set charinfo 9596 [list desc "Box Drawings Light Left and Heavy Right" short "boxd_llhr"]
tcl::dict::set charinfo 9597 [list desc "Box Drawings Light Up and Heavy Down" short "boxd_luhd"]
tcl::dict::set charinfo 9598 [list desc "Box Drawings Heavy Left and Light Right" short "boxd_hllr"]
tcl::dict::set charinfo 9599 [list desc "Box Drawings Heavy Up and Light Down" short "boxd_huld"]
tcl::dict::set charsets "Halfwidth and Fullwidth Forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants)" settype "tcl_supplemental"]
tcl::dict::set charsets "ascii_fullwidth" [list ranges [list {start 65281 end 65374}] description "Ascii 21 to 7E fullwidth" parentblock "halfwidth_and_fullwidth_forms" settype "other"]
tcl::dict::set charsets "Specials" [list ranges [list {start 65520 end 65535}] description "Specials" settype "tcl_supplemental"]
tcl::dict::set charsets "noncharacters" [list ranges [list {*}{
{start 64976 end 65007 note "BMP FDD0..FDEF"}
{start 65534 end 65535 note "BMP FFFE,FFFF"}
{start 131070 end 131071 note "plane1 1FFFE,1FFFF"}
{start 196606 end 196607 note "plane2 2FFFE,2FFFF"}
{start 262142 end 262143 note "plane3 3FFFE,3FFFF"}
{start 327678 end 327679 note "plane4 4FFFE,4FFFF"}
{start 393214 end 393215 note "plane5 5FFFE,5FFFF"}
{start 458750 end 458751 note "plane6 6FFFE,6FFFF"}
{start 524286 end 524287 note "plane7 7FFFE,7FFFF"}
{start 589822 end 589823 note "plane8 8FFFE,8FFFF"}
{start 655358 end 655359 note "plane9 9FFFE,9FFFF"}
{start 720894 end 720895 note "plane10 AFFFE,AFFFF"}
{start 786430 end 786431 note "plane11 BFFFE,BFFFF"}
{start 851966 end 851967 note "plane12 CFFFE,CFFFF"}
{start 917502 end 917503 note "plane13 DFFFE,DFFFF"}
{start 983038 end 983039 note "plane14 EFFFE,EFFFF"}
{start 1048574 end 1048575 note "plane15 FFFFE,FFFFF"}
{start 1114110 end 1114111 note "plane16 10FFFE,10FFFF"}
}] description "non-characters" settype "tcl_supplemental"
]
#build dicts keyed on short
variable charshort
proc _build_charshort {} {
variable charshort
set charshort [tcl::dict::create]
variable charinfo
tcl::dict::for {k v} $charinfo {
if {[tcl::dict::exists $v short]} {
set sh [tcl::dict::get $v short]
if {[tcl::dict::exists $charshort $sh]} {
puts stderr "_build_charshort WARNING character data load duplicate shortcode '$sh'"
}
tcl::dict::set charshort $sh [format %c $k]
}
}
return [tcl::dict::size $charshort]
}
_build_charshort
variable charset_extents_startpoints ;#stores endpoints associated with each startpoint - but named after key which is startpoint.
variable charset_extents_endpoints ;#stores startpoints assoicated with each endpoint - but named after key which is endpoint.
variable charset_extents_rangenames ;# dict keyed on start,end pointing to list of 2-tuples {setname rangeindex_within_set}
#build 2 indexes for each range.(charsets are not just unicode blocks so can have multiple ranges)
#Note that a range could be as small as a single char (startpoint = endpoint) so there can be many ranges with same start and end if charsets use some of the same subsets.
#as each charset_extents_startpoins,charset_extents_endpoints is built - the associated range name and index is appended to the rangenames dict
#startpoints - key is startpoint of a range, value is list of endpoints one for each range starting at this startpoint-key
#endpoints - key is endpoint of a range, value is list of startpoints one for each range ending at this endpoint-key
proc _build_charset_extents {} {
variable charsets
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_rangenames
set charset_extents_startpoints [tcl::dict::create]
set charset_extents_endpoints [tcl::dict::create]
set charset_extents_rangenames [tcl::dict::create]
tcl::dict::for {setname setinfo} $charsets {
set ranges [tcl::dict::get $setinfo ranges]
if {[tcl::dict::get $setinfo settype] eq "block"} {
#unicode block must have a single range
#we consider a char a member of the block even if unassigned/reserved (as per unicode documentation)
set start [tcl::dict::get [lindex $ranges 0] start]
set end [tcl::dict::get [lindex $ranges 0] end]
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
#assertion if end wasn't in startpoits list - then start won't be in endpoints list
tcl::dict::lappend charset_extents_startpoints $start $end
tcl::dict::lappend charset_extents_endpoints $end $start
}
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname 1]
} else {
#multirange sets/scripts. have holes. Char not a member if it's not explicitly in a defined range.
#They should be in order within a set - but we don't assume so
set r 1
foreach range $ranges {
set start [tcl::dict::get $range start]
set end [tcl::dict::get $range end]
if {![tcl::dict::exists $charset_extents_startpoints $start] || $end ni [tcl::dict::get $charset_extents_startpoints $start]} {
#assertion if end wasn't in startpoits list - then start won't be in endpoints list
tcl::dict::lappend charset_extents_startpoints $start $end
tcl::dict::lappend charset_extents_endpoints $end $start
}
tcl::dict::lappend charset_extents_rangenames ${start},${end} [list $setname $r]
incr r
}
}
}
#maintain in sorted order
#-stride is available in lsort even at tcl8.6 - but not in lsearch
set charset_extents_startpoints [lsort -stride 2 -integer $charset_extents_startpoints]
set charset_extents_endpoints [lsort -stride 2 -integer $charset_extents_endpoints]
#no need to sort charset_extents_rangenames - lookup only done using dict methods
return [tcl::dict::size $charset_extents_startpoints]
}
_build_charset_extents ;#rebuilds for all charsets
#nerdfonts are within the Private use E000 - F8FF range
proc load_nerdfonts {} {
variable charsets
variable charinfo
package require fileutil
set ver [package provide punk::char]
if {$ver ne ""} {
set ifneeded [package ifneeded punk::char [package provide punk::char]]
#puts stderr "punk::char ifneeded script: $ifneeded"
lassign [split $ifneeded ";"] _ sourceinfo
set basedir [file dirname [lindex $sourceinfo end]]
} else {
#review - will only work at package load time
set scr [info script]
if {$scr eq ""} {
error "load_nerdfonts unable to determine package folder"
}
set basedir [file dirname [info script]]
}
set pkg_data_dir [file join $basedir char]
set fname [file join $pkg_data_dir nerd-fonts-glyph-list.txt]
if {[file exists $fname]} {
#puts stderr "load_nerdfonts loading $fname"
set data [fileutil::cat -translation binary $fname]
set short_seen [tcl::dict::create]
set current_set_range [tcl::dict::create]
set filesets_loading [list]
foreach ln [split $data \n] {
set ln [tcl::string::trim $ln]
if {$ln eq ""} {continue}
set desc [lassign $ln hex rawsetname]
set hexnum 0x$hex
set dec [expr $hexnum]
set setname "nf_$rawsetname" ;#Ensure nerdfont set names are prefixed.
if {$setname ni $filesets_loading} {
if {![tcl::dict::exists $charsets $setname]} {
#set exists - but not in our filesets_loading list - therefore this set has been previously loaded, so clear old data first
dict unset charset $setname
}
set newrange [list start $dec end $dec]
tcl::dict::set current_set_range $setname $newrange
tcl::dict::set charsets $setname [list ranges [list $newrange] description "nerd fonts $rawsetname" settype "nerdfonts privateuse"]
lappend filesets_loading $setname
}
#expects ordered glyph list
set existing_range [tcl::dict::get $current_set_range $setname]
set existing_end [tcl::dict::get $existing_range end]
if {$dec - $existing_end == 1} {
#part of current range
tcl::dict::set current_set_range $setname end $dec
#overwrite last ranges element
set rangelist [lrange [tcl::dict::get $charsets $setname ranges] 0 end-1]
lappend rangelist [tcl::dict::get $current_set_range $setname]
tcl::dict::set charsets $setname ranges $rangelist
} else {
#new range for set
tcl::dict::set current_set_range $setname start $dec
tcl::dict::set current_set_range $setname end $dec
set rangelist [tcl::dict::get $charsets $setname ranges]
lappend rangelist [tcl::dict::get $current_set_range $setname]
tcl::dict::set charsets $setname ranges $rangelist
}
if {![tcl::dict::exists $charinfo $dec]} {
# -- ---
#review
set map [list beaufort bf gibbous gb crescent cr thunderstorm tstorm thermometer thermom]
lappend map {*}[list directory dir creativecommons ccom creative_commons ccom forwardslash fs]
lappend map {*}[list multimedia mm multiple multi outline outl language lang]
lappend map {*}[list odnoklassniki okru]
# -- ---
#consider other ways to unambiguously shorten names?
#normalize nf_fa & nf_fa 'o' element to 'outl' so outlines can be searched across sets more easily (o not necessarily at last position)
set normdesc [list]
foreach el $desc {
if {$el eq "o"} {
set el "outl"
}
lappend normdesc $el
}
set joined_desc [join $normdesc _]
#map after join so we can normalize some underscored elements e.g creativecommons & creative_commons
set mapped_desc [tcl::string::map $map $joined_desc]
set s nf_${rawsetname}_$mapped_desc
if {![tcl::dict::exists $short_seen $s]} {
tcl::dict::set short_seen $s {}
} else {
#duplicate in the data file (e.g 2023 weather night alt rain mix)
set s ${s}_$hex
}
tcl::dict::set charinfo $dec [list desc "$desc" short $s]
}
}
_build_charshort
_build_charset_extents
} else {
puts stderr "unable to find glyph file. Tried $fname"
}
}
proc package_base {} {
#assume punk::char is in .tm form and we can use the package provide statement to determine base location
#review
set pkgver [package present punk::char]
set pkginfo [package ifneeded punk::char $pkgver]
set tmfile [lindex $pkginfo end]
set pkg_base [file dirname $tmfile]
return $pkg_base
}
tcl::namespace::eval internal {
proc unicode_folder {} {
set parent [file join [punk::char::package_base] char]
set candidates [glob -nocomplain -type d -dir $parent -tail unicode*]
set candidates [lsort -increasing $candidates] ;#review - dictionary sort - how are unicode versions ranked/compared??
if {![llength $candidates]} {
error "Failed to find unicode data folder in folder '$parent'"
}
set folder [file join $parent [lindex $candidates end]]
return $folder
}
proc dict_getdef {dictValue args} {
if {[llength $args] < 2} {
error {wrong # args: should be "dict_getdef dictValue ?key ...? key default"}
}
set keys [lrange $args 0 end-1]
if {[tcl::dict::exists $dictValue {*}$keys]} {
return [tcl::dict::get $dictValue {*}$keys]
} else {
return [lindex $args end]
}
}
}
#charsets structure
#tcl::dict::set charsets "halfwidth_and_fullwidth_forms" [list ranges [list {start 65280 end 65519}] description "Halfwidth and Fullwidth Forms (variants) settype block"]
#unicode Blocks.txt
#load the defined blocks into 'charsets' and mark as type 'block'. Unicode blocks have only one range - and don't overlap.
#We don't treat unassigned/reserved codes within a block specially at this stage - ie we will not chop a block into subranges on that basis.
#unassigned code points should get certain default properties (e.g bidirectionality ) according to their block - so it makes sense to treat them as belonging to the block.
#They also get the general property of Cn (Other,not assigned or Other,reserved) and a "Basic Type" of Noncharacter or Reserved
proc load_unicode_blocks {} {
#sample data line
#0000..007F; Basic Latin
variable charsets
set file [file join [internal::unicode_folder] Blocks.txt]
if {![file exists $file]} {
error "Unicode Blocks.txt file not found at path '$file'"
}
puts "ok.. loading"
set fd [open $file r]
chan configure $fd -translation binary
set data [read $fd]
close $fd
set block_count 0
foreach ln [split $data \n] {
set ln [tcl::string::trim $ln]
if {[tcl::string::match #* $ln]} {
continue
}
if {[set pcolon [tcl::string::first ";" $ln]] > 0} {
set lhs [tcl::string::trim [tcl::string::range $ln 0 $pcolon-1]]
set name [tcl::string::trim [tcl::string::range $ln $pcolon+1 end]]
set lhsparts [split $lhs .]
set start [lindex $lhsparts 0]
set end [lindex $lhsparts end]
#puts "$start -> $end '$name'"
set decimal_start [expr {"0x$start"}]
set decimal_end [expr {"0x$end"}]
tcl::dict::set charsets $name [list ranges [list [list start $decimal_start end $decimal_end note "unicode block $lhs"]] description "" settype block]
incr block_count
}
}
_build_charset_extents
return $block_count
}
#unicode scripts
#unicode UnicodeData.txt
#https://www.unicode.org/reports/tr44/#Property_Values
#unicode EastAsianWidth.txt
#classify width of character - which is contextual in some cases
#####
#Review - this is initial naive assumption that should get us mostly what we want for layout purposes in a utf-8-centric world.
#We will just load the values and treat H,N,Na as 1-wide and A,F,W as 2-wide for functions such as char::string_width on the basis that those using legacy sets can query the property and make their own determinations in those contexts.
####
# -- ---
#A = Ambiguous - All characters that can be sometimes wide and sometimes narrow. (wide in east asian legacy sets, narrow in non-east asian usage) (private use chars considered ambiguous)
#F = East Asian Full-width
#H = East Asian Half-width
#N = Not east Asian (Neutral) - all other characters. (do not occur in legacy East Asian character sets) - treated like Na
#Na = East Asian Narrow - all other characters that are always narrow and have explicit full-width counterparts (e.g includes all of ASCII)
#W = East Asian Wide - all other characters that are always wide (Occur only in the context of Eas Asian Typography)
# -- ---
proc charshort {shortname} {
variable charshort
return [tcl::dict::get $charshort $shortname]
}
proc box_drawing {args} {
return [charset "Box Drawing" {*}$args]
}
proc box_drawing_dict {} {
return [charset_dict "Box Drawing"]
}
proc char_hex {char} {
return [format %08x [scan $char %c]]
}
proc char_info_hex {hex args} {
set hex [tcl::string::map [list _ ""] $hex]
if {[tcl::string::is xdigit -strict $hex]} {
#has no leading 0x
set dec [expr {"0x$hex"}]
} else {
set dec [expr {$hex}]
}
return [char_info_dec $dec {*}$args]
}
proc char_info {char args} {
#Note - on some versions of Tcl -e.g 8.6 use could supply something like \U1f600 (smiley icon) but we receive fffd (replacement special)
#there is no way to detect what the user intended ie we can't distinguish if they actually typed \UFFFD
#we can test if such mapping happens in general - and warn if codepoint is FFFD in the result dict
set returninfo [tcl::dict::create]
if {[tcl::string::equal \UFFFD $char] && [tcl::string::equal \U1F600 \UFFFD]} {
tcl::dict::set returninfo WARNING "this tcl maps multiple to FFFD"
}
lassign [scan $char %c%s] dec_char remainder
if {[tcl::string::length $remainder]} {
error "char_info requires a single character"
}
set result [tcl::dict::merge $returninfo [char_info_dec $dec_char {*}$args]]
}
proc char_info_dec {dec args} {
set dec_char [expr {$dec}]
set opts [tcl::dict::create {*}{
-fields {default}
-except {}
}]
#testwidth is so named because it peforms an actual test on the console using ansi escapes - and the name gives a hint that it is a little slow
set known_fields [list all default dec hex desc short testwidth char memberof] ;#maint fields from charinfo 'desc' 'short'
#todo - unicode properties
# tclwhitespace (different to unicode concept of whitespace. review )
foreach {k v} $args {
switch -- $k {
-fields - -except {
tcl::dict::set opts $k $v
}
default {
error "char_info unrecognised option '$k'. Known options:'[tcl::dict::keys $opts]' known_fields: $known_fields usage: char_info <char> ?-fields {<fieldnames>}? ?-except {<fieldnames>}?"
}
}
}
# -- --- --- --- --- --- --- --- --- --- --- ---
set opt_fields [tcl::dict::get $opts -fields]
set opt_except [tcl::dict::get $opts -except]
# -- --- --- --- --- --- --- --- --- --- --- ---
set initial_fields [list]
if {"default" in $opt_fields} {
set initial_fields $known_fields
if {"testwidth" ni $opt_fields} {
if {"testwidth" ni $opt_except} {
lappend opt_except testwidth
}
}
if {"char" ni $opt_fields} {
if {"char" ni $opt_except} {
lappend opt_except char
}
}
} elseif {"all" in $opt_fields} {
set initial_fields $known_fields
} else {
foreach f $opt_fields {
if {$f in $known_fields} {
lappend initial_fields $f
} else {
error "char_info unknown field name: '$f' known fields: '$known_fields'"
}
}
}
foreach e $opt_except {
if {$e ni $known_fields} {
error "char_info unknown field name $e in -except. known fields: '$known_fields'"
}
}
set fields [list]
foreach f $initial_fields {
if {$f ne "all" && $f ni $opt_except} {
lappend fields $f
}
}
if {![llength $fields]} {
return
}
variable charinfo
variable charsets
set hex_char [format %04x $dec_char]
set returninfo [tcl::dict::create]
foreach f $fields {
switch -- $f {
dec {
tcl::dict::set returninfo dec $dec_char
}
hex {
tcl::dict::set returninfo hex $hex_char
}
desc {
if {[tcl::dict::exists $charinfo $dec_char desc]} {
tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char desc]
} else {
tcl::dict::set returninfo desc ""
}
}
short {
if {[tcl::dict::exists $charinfo $dec_char short]} {
tcl::dict::set returninfo desc [tcl::dict::get $charinfo $dec_char short]
} else {
tcl::dict::set returninfo short ""
}
}
testwidth {
#todo - expectedwidth - lookup the printing width it is *supposed* to have from unicode tables
#testwidth is one of the main ones likely to be worthwhile excluding as querying the console via ansi takes time
set existing_testwidth ""
if {[tcl::dict::exists $charinfo $dec_char testwidth]} {
set existing_testwidth [tcl::dict::get $charinfo $dec_char testwidth]
}
if {$existing_testwidth eq ""} {
#no cached data - do expensive cursor-position test (Note this might not be 100% reliable - terminals lie e.g if ansi escape sequence has set to wide printing.)
set char [format %c $dec_char]
set chwidth [char_info_testwidth $char]
tcl::dict::set returninfo testwidth $chwidth
#cache it. todo - -verify flag to force recalc in case font/terminal changed in some way?
tcl::dict::set charinfo $dec_char testwidth $chwidth
} else {
tcl::dict::set returninfo testwidth $existing_testwidth
}
}
char {
set char [format %c $dec_char]
tcl::dict::set returninfo char $char
}
memberof {
#memberof takes in the order of a few hundred microseconds if a simple scan of all ranges is taken - possibly worthwhile caching/optimising
#note that memberof is not just unicode blocks - but scripts and other non-contiguous sets consisting of multiple ranges - some of which may include ranges of a single character. (e.g WGL4)
#This means there probably isn't a really efficient means of calculating membership other than scanning all the defined ranges.
#We could potentially populate it using a side-thread - but it seems reasonable to just cache result after first use here.
#some efficiency could also be gained by pre-calculating the extents for each charset which isn't a simple unicode block. (and perhaps sorting by max char)
set memberof [list]
tcl::dict::for {setname setinfo} $charsets {
foreach r [tcl::dict::get $setinfo ranges] {
set s [tcl::dict::get $r start]
set e [tcl::dict::get $r end]
if {$dec_char >= $s && $dec_char <= $e} {
lappend memberof $setname
break
}
}
}
tcl::dict::set returninfo memberof $memberof
}
}
}
return $returninfo
}
proc _char_info_dec_memberof_scan {dec} {
variable charsets
set memberof [list]
tcl::dict::for {setname setinfo} $charsets {
foreach r [tcl::dict::get $setinfo ranges] {
set s [tcl::dict::get $r start]
set e [tcl::dict::get $r end]
if {$dec >= $s && $dec <= $e} {
lappend memberof $setname
break
}
}
}
return $memberof
}
proc range_split_info {dec} {
variable charset_extents_startpoints
variable charset_extents_endpoints
set skeys [tcl::dict::keys $charset_extents_startpoints]
set ekeys [tcl::dict::keys $charset_extents_endpoints]
set splen [tcl::dict::size $charset_extents_startpoints]
set eplen [tcl::dict::size $charset_extents_endpoints]
set s [lsearch -bisect -integer $skeys $dec]
set s_at_or_below [lrange $skeys 0 $s]
set e_of_s [list]
foreach sk $s_at_or_below {
lappend e_of_s {*}[tcl::dict::get $charset_extents_startpoints $sk]
}
set e_of_s [lsort -integer $e_of_s]
set splitposn [lsearch -bisect -integer $e_of_s $dec]
if {[lindex $e_of_s $splitposn] < $dec} {incr splitposn}
#set lhs_endpoints_to_check [expr {[llength $e_of_s] - $splitposn}]
set reduced_endpoints [lrange $e_of_s $splitposn end]
set sps [list]
foreach ep $reduced_endpoints {
lappend sps {*}[tcl::dict::get $charset_extents_endpoints $ep]
}
set e [lsearch -bisect -integer $ekeys $dec]
if {$e >= 0} {
set e_at_or_above [lrange $ekeys $e end]
set s_of_e [list]
foreach ek $e_at_or_above {
lappend s_of_e {*}[tcl::dict::get $charset_extents_endpoints $ek]
}
set startpoints_of_above [llength $s_of_e]
set splitposn [lsearch -bisect -integer $s_of_e $dec]
set reduced_startpoints [lrange $s_of_e 0 $splitposn]
set eps [list]
foreach sp $reduced_startpoints {
lappend eps {*}[tcl::dict::get $charset_extents_startpoints $sp]
}
} else {
set s_of_e [list]
set reduced_startpoints [list]
set eps [list]
}
return [tcl::dict::create startpoints $splen endpoints $eplen midpoint [expr {floor($eplen/2)}] posn $e lhs_endpoints_to_check "[llength $reduced_endpoints]/[llength $e_of_s]=[llength $sps]ranges" rhs_startpoints_to_check "[llength $reduced_startpoints]/[llength $s_of_e]=[llength $eps]"]
}
#for just a few extra sets such as wgl4 and unicode blocks loaded - this gives e.g 5x + better performance than the simple search above, and up to twice as slow for tcl 8.6
#performance biased towards lower numbered characters (which is not too bad in the context of unicode)
#todo - could be tuned to perform better at end by assuming a fairly even distribution of ranges - and so searching startpoint ranges first for items left of middle, endpoint ranges first for items right of middle
#review with scripts loaded and more defined ranges..
#This algorithm supports arbitrary overlapping ranges and ranges with same start & endpoints
#Should be much better than O(n) for n sets except for pathological case of member of all or nearly-all intervals ?
#review - compare with 'interval tree' algorithms.
proc char_info_dec_memberof {dec} {
variable charset_extents_startpoints
variable charset_extents_endpoints
variable charset_extents_rangenames
if {[package vcompare [info tclversion] 8.7a5] >= 0} {
#algorithm should theoretically be a little better with -stride
set last_smaller_or_equal_startposn [lsearch -stride 2 -bisect -integer $charset_extents_startpoints $dec]
set sets_starting_below [lrange $charset_extents_startpoints 0 $last_smaller_or_equal_startposn+1] ;#+1 to include 2nd element of stridden pair
set endpoints_of_starting_below [lsort -integer [concat {*}[tcl::dict::values $sets_starting_below]]]
} else {
#no -stride available
set startkeys [tcl::dict::keys $charset_extents_startpoints]
set last_smaller_or_equal_startkeyposn [lsearch -bisect -integer $startkeys $dec] ;#assert will always return one of the keys if number >=0 supplied (last key if > all)
#set startkey_found [lindex $startkeys $last_smaller_or_equal_startkeyposn]
set start_below_keys [lrange $startkeys 0 $last_smaller_or_equal_startkeyposn] ;#These are the keys of sets which start at or below dec
#puts "start_below_keys: '$start_below_keys'"
set endpoints_of_starting_below [list]
foreach belowkey $start_below_keys {
lappend endpoints_of_starting_below {*}[tcl::dict::get $charset_extents_startpoints $belowkey]
}
#set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below[unset endpoints_of_starting_below]]
set endpoints_of_starting_below [lsort -integer $endpoints_of_starting_below]
}
set splitposn [lsearch -bisect -integer $endpoints_of_starting_below $dec] ;#splitposn = last smaller or equal endposn
if {[lindex $endpoints_of_starting_below $splitposn] < $dec} { incr splitposn}
set reduced_opposite_limit [lrange $endpoints_of_starting_below $splitposn end]
################
#note each endpoint points to multiple startpoints which may still include some that are not in range. (e.g range y can share endpoint with x that starts in-range - but y starts above character )
# x1 x2
# y1 y2
# c
################
#we have reduced our set of endpoints sufficiently (to those at or above dec) to run through and test each startpoint
set ranges [list]
foreach ep $reduced_opposite_limit {
foreach s [tcl::dict::get $charset_extents_endpoints $ep] {
if {$s <= $dec} {
lappend ranges [tcl::dict::get $charset_extents_rangenames $s,$ep]
}
}
}
return $ranges
}
#with glob searching of description and short
proc char_range_dict {start end args} {
if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} {
error "char_range_dict error start and end must be integers"
}
set and_globs [list]
if {![llength $args]} {
set args [list *]
}
foreach glob $args {
if {![regexp {[*?]} $glob]} {
lappend and_globs "*$glob*"
} else {
lappend and_globs $glob
}
}
variable charinfo
set cdict [tcl::dict::create]
set start [expr {$start}] ;#force string rep to decimal - otherwise first use of i as string could be hex or other rep whilst other i values will be decimal string rep due to incr
for {set i $start} {$i <= $end} {incr i} {
set hx [format %04x $i]
set ch [format %c $i]
if {[tcl::dict::exists $charinfo $i desc]} {
set d [tcl::dict::get $charinfo $i desc]
} else {
set d ""
}
if {[tcl::dict::exists $charinfo $i short]} {
set s [tcl::dict::get $charinfo $i short]
} else {
set s ""
}
set matchcount 0
foreach glob $and_globs {
if {[tcl::string::match -nocase $glob $s] || [tcl::string::match -nocase $glob $d]} {
incr matchcount
}
}
if {$matchcount == [llength $and_globs]} {
if {[tcl::dict::exists $charinfo $i]} {
tcl::dict::set cdict $hx [tcl::dict::merge [tcl::dict::create dec $i hex $hx char $ch] [tcl::dict::get $charinfo $i]]
} else {
tcl::dict::set cdict $hx [list dec $i hex $hx char $ch desc $d short $s]
}
}
}
return $cdict
}
#with glob searches of desc and short
proc char_range {start end args} {
package require overtype
if {![tcl::string::is integer -strict $start] || ![tcl::string::is integer -strict $end]} {
error "char_range error start and end must be integers"
}
set charset_dict [char_range_dict $start $end {*}$args]
set out ""
set col3 [tcl::string::repeat " " 12]
tcl::dict::for {k inf} $charset_dict {
set s [internal::dict_getdef $inf short ""]
set d [internal::dict_getdef $inf desc ""]
set s_col [overtype::left $col3 $s]
append out "$k [tcl::dict::get $inf dec] [tcl::dict::get $inf char] $s_col $d" \n
}
return $out
}
#non-overlapping unicode blocks
proc char_blocks {{name_or_glob *}} {
variable charsets
#todo - more efficient datastructures?
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
if {[tcl::dict::exists $charsets $name_or_glob]} {
if {[tcl::dict::get $charsets $name_or_glob settype] eq "block"} {
return [tcl::dict::create $name_or_glob [tcl::dict::get $charsets $name_or_glob]]
}
}
#no exact match - try case insensitive..
set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]
if {$name ne ""} {
if {[tcl::dict::get $charsets $name settype] eq "block"} {
return [tcl::dict::create $name [tcl::dict::get $charsets $name]]
}
}
} else {
#build a subset
set charsets_block [tcl::dict::create]
tcl::dict::for {k v} $charsets {
if {[tcl::string::match -nocase $name_or_glob $k]} {
if {[tcl::dict::get $v settype] eq "block"} {
tcl::dict::set charsets_block $k $v
}
}
}
return $charsets_block
}
}
proc charset_names {{name_or_glob *}} {
variable charsets
if {![regexp {[?*]} $name_or_glob]} {
#no glob - just retrieve it
if {[tcl::dict::exists $charsets $name_or_glob]} {
return [list $name_or_glob]
}
#no exact match - try case insensitive..
set name [lsearch -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]
if {$name ne ""} {
return [list $name]
}
} else {
if {$name_or_glob eq "*"} {
return [lsort [tcl::dict::keys $charsets]]
}
#tcl::dict::keys $dict <pattern> doesn't have option for case insensitive searches
return [lsort [lsearch -all -inline -nocase [tcl::dict::keys $charsets] $name_or_glob]]
}
}
#deprecated
#major named sets such as unicode blocks, scripts, and other sets such as microsoft WGL4
#case insensitive search - possibly with *basic* globs (? and * only - not lb rb)
proc charset_names2 {{namesearch *}} {
variable charsets
#dictionary sorting of the keys is slow! - we should obviously store it in sorted order instead of sorting entire list on retrieval - or just sort results
#set sortedkeys [lsort -increasing -dictionary [tcl::dict::keys $charsets]] ;#NOTE must use -dictionary to use -sorted flag below
set sortedkeys [lsort -increasing [tcl::dict::keys $charsets]]
if {$namesearch eq "*"} {
return $sortedkeys
}
if {[regexp {[?*]} $namesearch]} {
#name glob search
return [lsearch -all -inline -nocase $sortedkeys $namesearch] ;#no point using -sorted flag when -all is used
} else {
#return [lsearch -sorted -inline -nocase $sortedkeys $namesearch] ;#no globs - bails out earlier if -sorted?
return [lsearch -inline -nocase $sortedkeys $namesearch] ;#no globs
}
}
proc charsets {{namesearch *}} {
package require textblock
variable charsets
set charset_names [charset_names $namesearch]
set settype_list [list]
foreach setname $charset_names {
lappend settype_list [tcl::dict::get $charsets $setname settype]
}
#set charset_names [linsert $charset_names 0 "Set Name"]
ledit charset_names 0 -1 "Set Name"
#set settype_list [linsert $settype_list 0 "Set Type"]
ledit settype_list 0 -1 "Set Type"
return [textblock::join -- [list_as_lines -- $charset_names] " " [list_as_lines $settype_list]]
}
proc charset_defget {exactname} {
variable charsets
return [tcl::dict::get $charsets $exactname]
}
proc charset_defs {charsetname} {
variable charsets
set matches [charset_names $charsetname]
set def_list [list]
foreach setname $matches {
lappend def_list [tcl::dict::create $setname [tcl::dict::get $charsets $setname]]
}
return [join $def_list \n]
}
proc charset_dictget {exactname} {
variable charsets
set setinfo [tcl::dict::get $charsets $exactname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end]]
}
return $charset_dict
}
proc charset_dicts {searchname} {
variable charsets
set matches [charset_names $searchname]
if {![llength $matches]} {
error "No charset found matching name '$searchname' - use 'charset_names' to get list"
}
set dict_list [list]
foreach m $matches {
lappend dict_list [tcl::dict::create $m [charset_dictget $m]]
}
#return $dict_list
return [join $dict_list \n]
}
proc charset_page {namesearch args} {
_charset_page_search $namesearch $args ;#pass args to descsearch argument
}
proc _charset_page_search {namesearch search_this_and_that args} {
variable charsets
variable charinfo
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "charset_page no charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set defaults [tcl::dict::create {*}{
-ansi 0
-lined 1
}]
set opts [tcl::dict::merge $defaults $args]
# -- --- --- ---
set opt_ansi [tcl::dict::get $opts -ansi]
set opt_lined [tcl::dict::get $opts -lined]
# -- --- --- ---
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
if {$opt_ansi} {
set a1 [a BLACK white bold]
set a2 [a]
} else {
set a1 ""
set a2 ""
}
set cols 16
set prefix " "
append out $prefix
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [tcl::dict::get $charsets $charsetname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
if {![tcl::dict::size $charset_dict]} {
continue
}
set i 1
append out \n $prefix $charsetname
append out \n
set marker_line $prefix
set line $prefix
tcl::dict::for {hex inf} $charset_dict {
set ch [tcl::dict::get $inf char]
set twidth ""
set dec [expr {"0x$hex"}]
if {[tcl::dict::exists $charinfo $dec testwidth]} {
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
#set width [ansifreestring_width $ch] ;#based on unicode props
set width [grapheme_width_cached $ch]
} else {
set width $twidth
}
if {$width == 0} {
set marker " "
if {[regexp $re_diacritics $ch]} {
#attempt to combine with space to get 3-wide displayv with diacritic showing at left space
#todo - dualchar diacritics?
set displayv " $ch "
} else {
set displayv " "
}
} elseif {$width == 1} {
set marker "_ "
set displayv "${a1}$ch${a2} "
} else {
#presumed 2
set marker "__ "
set displayv "${a1}$ch${a2} "
}
set hexlen [tcl::string::length $hex]
append marker_line "[tcl::string::repeat " " $hexlen] $marker"
append line "$hex $displayv"
if {$i == [tcl::dict::size $charset_dict] || $i % $cols == 0} {
if {$opt_lined} {
append out $marker_line \n
}
append out $line \n
set marker_line $prefix
set line $prefix
#set out [tcl::string::range $out 0 end-2]
#append out \n " "
}
incr i
}
}
set out [tcl::string::trimright $out " "]
return $out
}
#allows search on both name and an anded list of globs to be applied to description & short
proc charset {namesearch args} {
package require overtype
variable charsets
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set search_this_and_that $args
set out ""
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [tcl::dict::get $charsets $charsetname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
set col_items_short [list]
set col_items_desc [list]
tcl::dict::for {k inf} $charset_dict {
lappend col_items_desc [internal::dict_getdef $inf desc ""]
lappend col_items_short [internal::dict_getdef $inf short ""]
}
if {[llength $col_items_desc]} {
set widest3 [tcl::mathfunc::max {*}[lmap v $col_items_short {tcl::string::length $v}]]
if {$widest3 == 0} {
set col3 " "
} else {
set col3 [tcl::string::repeat " " $widest3]
}
tcl::dict::for {k inf} $charset_dict {
set s [internal::dict_getdef $inf short ""]
set d [internal::dict_getdef $inf desc ""]
set s_col [overtype::left $col3 $s]
append out "$k [tcl::dict::get $inf char] $s_col $d" \n
}
}
}
return $out
}
#use console cursor movements to test and cache the column-width of each char in the set of characters returned by the search criteria
proc charset_calibrate {namesearch args} {
variable charsets
variable charinfo
set matched_names [charset_names $namesearch]
if {![llength $matched_names]} {
error "No charset matched pattern '$namesearch' - use 'charset_names' to get list"
}
set search_this_and_that $args
set charcount 0
set width_results [tcl::dict::create]
puts stdout "calibrating using terminal cursor movements.."
foreach charsetname $matched_names {
if {[llength $search_this_and_that]} {
set setinfo [tcl::dict::get $charsets $charsetname]
set ranges [tcl::dict::get $setinfo ranges]
set charset_dict [tcl::dict::create]
foreach r $ranges {
set start [tcl::dict::get $r start]
set end [tcl::dict::get $r end]
set charset_dict [tcl::dict::merge $charset_dict [char_range_dict $start $end {*}$search_this_and_that]]
}
} else {
set charset_dict [charset_dictget $charsetname]
}
if {![tcl::dict::size $charset_dict]} {
continue
}
tcl::dict::for {hex inf} $charset_dict {
set ch [format %c 0x$hex]
set twidth ""
set dec [expr {"0x$hex"}]
if {[tcl::dict::exists $charinfo $dec testwidth]} {
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
#puts -nonewline stdout "." ;#this
set width [char_info_testwidth $ch] ;#based on console test rather than unicode props
tcl::dict::set charinfo $dec testwidth $width
} else {
set width $twidth
}
tcl::dict::incr width_results $width
incr charcount
}
}
puts stdout "\ncalibration done - results cached in charinfo dictionary"
return [tcl::dict::create charcount $charcount widths $width_results]
}
#maint warning - also in overtype!
#intended for single grapheme - but will work for multiple
#cannot contain ansi or newlines
#(a cache of ansifreestring_width calls - as these are quite regex heavy)
#review - effective memory leak on longrunning programs if never cleared
#tradeoff in fragmenting cache and reducing efficiency vs ability to clear in a scoped manner
proc grapheme_width_cached {ch {key ""}} {
variable grapheme_widths
#if key eq "*" - we won't be able to clear that cache individually. Perhaps that's ok
if {[tcl::dict::exists $grapheme_widths $key $ch]} {
return [tcl::dict::get $grapheme_widths $key $ch]
}
set width [punk::char::ansifreestring_width $ch] ;#review - can we provide faster version if we know it's a single grapheme rather than a string? (grapheme is still a string as it may have combiners/diacritics)
tcl::dict::set grapheme_widths $key $ch $width
return $width
}
proc grapheme_width_cache_clear {key} {
variable grapheme_widths
if {$key eq "*} {
set grapheme_widths [tcl::dict::create]
} else {
tcl::dict::unset grapheme_widths $key
}
return
}
#no char_width - use grapheme_width terminology to be clearer
proc grapheme_width {char} {
error "grapheme_width unimplemented - use ansifreestring_width"
}
#return N Na W etc from unicode data
#review
proc char_uc_width_prop {char} {
error "char_uc_width unimplemented try textutil::wcswidth_type"
}
#todo - provide a grapheme_width function that is optimised for speed
proc string_width {text} {
#burn approx 2uS (2024) checking for ansi codes - not just SGR
if {[punk::ansi::ta::detect $text]} {
puts stderr "string_width detected ANSI!"
}
if {[tcl::string::last \n $text] >= 0} {
error "string_width accepts only a single line"
}
#tailcall ansifreestring_width $text
ansifreestring_width $text
}
#todo - consider disallowing/erroring out when \r \n in string?
# - tab/vtab?
# - compare with wcswidth returning -1 for entire string containing such in python,perl
proc wcswidth {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#Tcl initial evaluation stack size is 2000 (? review)
#REVIEW - when we cater for grapheme clusters - we can't just split the string at arbitrary points like this!!
set chunksize 2000
set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}]
set width 0
set startidx 0
set endidx [expr {$startidx + $chunksize -1}]
for {set i 0} {$i < $chunks_required} {incr i} {
set chunk [tcl::string::range $string $startidx $endidx]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
foreach c $codes {
if {$c <= 255} {
if {$c == 9 || ($c >= 31 && $c != 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
incr width
}
} elseif {$c < 917504 || $c > 917631} {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $c]
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
incr startidx $chunksize
incr endidx $chunksize
}
return $width
}
proc wcswidth_clustered {string} {
package require tk
set width 0
set i 0
if {![regexp "\[\uFF-\U10FFFF\]" $string]} {
return [punk::char::wcswidth_unclustered $string] ;#still use our wcswidth to account for non-printable ascii
}
while {$i < [tcl::string::length $string]} {
set aftercluster [tk::endOfCluster $string $i]
set g [string range $string $i $aftercluster-1]
if {$aftercluster > ($i + 1)} {
#review - proper way to determine screen width (columns occupied) of a cluster??
#according to this:
#https://lib.rs/crates/unicode-display-width
#for each grapheme - if any of the code points in the cluster have an east asian width of 2,
#The entire grapheme width is 2 regardless of how many code points constitute the grapheme
set gw 1
foreach ch [split $g ""] {
if {[punk::char::wcswidth_single $ch] == 2} {
set gw 2
break
}
}
incr width $gw
#if {[string first \u200d $g] >=0} {
# incr width 2
#} else {
# #other joiners???
# incr width [wcswidth_unclustered $g]
#}
} else {
incr width [wcswidth_unclustered $g]
}
set i $aftercluster
}
return $width
}
proc wcswidth_single {char} {
scan $char %c dec
if {$dec <= 255} {
if {$dec == 9} {
#tab always represented by at least one char in terminal etc.
#caller will need to process tabs themselves to determine extra width applicable to their circumstance.
return 1
}
if {($dec < 31 || $dec == 127)} {
return 0
}
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
return 1
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
return [textutil::wcswidth_char $dec] ;#note textutil::wcswidth_char takes a decimal codepoint!
#may return -1 - REVIEW
}
return 0
}
proc wcswidth_unclustered1 {string} {
set width 0
foreach c [split $string {}] {
scan $c %c dec
if {$dec <= 255} {
if {$dec == 9 || ($dec >= 31 && $dec != 127)} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
incr width
}
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
return $width
}
#todo - consider disallowing/erroring out when \r \n in string?
# - tab/vtab?
# - compare with wcswidth returning -1 for entire string containing such in python,perl
proc wcswidth_unclustered {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+ (Tcl evaluation stack has to be reallocated/copied?)
#Tcl initial evaluation stack size is 2000 (? review)
#we can only split the string at arbitrary points like this because we are specifically dealing with input that has no clusters!.
set chunksize 2000
set chunks_required [expr {int(ceil([tcl::string::length $string] / double($chunksize)))}]
set width 0
set startidx 0
set endidx [expr {$startidx + $chunksize -1}]
for {set i 0} {$i < $chunks_required} {incr i} {
set chunk [tcl::string::range $string $startidx $endidx]
set codes [scan $chunk [tcl::string::repeat %c [tcl::string::length $chunk]]]
foreach dec $codes {
if {$dec <= 255} {
if {($dec ==9 || ($dec >= 31 && $dec != 127))} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcswidth
incr width
}
} elseif {$dec < 917504 || $dec > 917631} {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec]
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
incr startidx $chunksize
incr endidx $chunksize
}
return $width
}
# ------------------------------------------------------------------------------------------------------
proc wcswidth0 {string} {
#faster than textutil::wcswidth (at least for string up to a few K in length)
#..but - 'scan' is horrible for 400K+
#TODO
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
set width 0
foreach dec $codes {
#unicode Tags block zero width
if {$dec < 917504 || $dec > 917631} {
if {$dec <= 255} {
#review - non-printing ascii? why does textutil::wcswidth report 1 ??
#todo - compare with python or other lang wcwidth
if {!($dec < 31 || $dec == 127)} {
incr width
}
} else {
#TODO - various other joiners and non-printing chars
set w [textutil::wcswidth_char $dec] ;#takes decimal codepoint
if {$w < 0} {
return -1
} else {
incr width $w
}
}
}
}
return $width
}
proc wcswidth2 {string} {
set codes [scan $string [tcl::string::repeat %c [tcl::string::length $string]]]
set widths [lmap dec $codes {textutil::wcswidth_char $dec}]
if {-1 in $widths} {
return -1
}
return [tcl::mathop::+ {*}$widths]
}
#prerequisites - no ansi escapes - no newlines - utf8 encoding assumed
#review - what about \r \t \b ?
#NO processing of \b - already handled in ansi::printing_length which then calls this
#this version breaks string into sequences of ascii vs unicode
proc ansifreestring_width {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
#}
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
# -- --- --- --- --- --- ---
#review
#if we strip out ZWJ \u200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis
#as at 2024 - textutil::wcswidth just uses the unicode east-asian width property data and doesn't seem to handle these specially - it counts this joiner and others as one wide (also BOM \uFFEF)
#TODO - once we have proper grapheme cluster splitting - work out which of these characters should be left in and/or when exactly their length-effects apply
#
#for now - strip them out
#ZWNJ \u200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWSP \u200b zero width space
#\uFFEFBOM/ ZWNBSP and others that should be zero width
#todo - work out proper way to mark/group zero width.
#set text [tcl::string::map [list \u200b "" \u200c "" \u200d "" \uFFEF ""] $text]
set text [tcl::string::map [list \u200b "" \u200c "" \u200d ""] $text]
#\uFFEF tends to print as 1 length replacement char - REVIEW
#\uFFFF varies between terminals - some print replacement char (width 1) some print nothing (width 0)
# -- --- --- --- --- --- ---
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#todo - document that these shouldn't be present in input rather than explicitly checking here
#c0 controls + del (127 7f) - tab
#set re_ascii_c0 {[\U0000-\U001F]}
set re_ascii_c0 {[\u0000-\u0008\u000A-\u001F\u007F]}
set text [regsub -all $re_ascii_c0 $text ""]
#c1 controls - first section of the Latin-1 Supplement block - all non-printable from a utf-8 perspective
#some or all of these may have a visual representation in other encodings e.g cp855 seems to have 1 width for them all
#we are presuming that the text supplied has been converted from any other encoding to utf8 - so we don't need to handle that here
#they should also be unlikely to be present in an ansi-free string (as is a precondition for use of this function)
set text [regsub -all {[\u0080-\u009f]+} $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\u100-\U10FFFF]} $text]} {
# return [tcl::string::length $text]
#}
if {![regexp "\[\u100-\U10FFFF\]" $text]} {
return [tcl::string::length $text]
#punk::char::wcswidth has to split and examine dec value of each code
#By stripping controls + 7F (leaving tab) we've already eliminated the non-printable ascii - REVIEW
#return [punk::char::wcswidth $text] ;#still use our wcswidth to account for non-printable ascii
}
#split just to get the standalone character widths - and then scan for other combiners (?) - or scan for clusters first?
#review
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
#maintain unicode as sequences - todo - scan for grapheme clusters
#set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0100-\U10FFFF\])+" $text]
set uc_sequences [punk::ansi::ta::_perlish_split "(?:\[\u0000-\u00FF\])+" $text]
set len 0
foreach {uc ascii} $uc_sequences {
#puts "-ascii $ascii"
#puts "-uc $uc"
incr len [tcl::string::length $ascii]
#textutil::wcswidth uses unicode data
#fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for)
#todo - find something that understands grapheme clusters - needed also for grapheme_split
#use punk::char::wcswidth - faster than the string split in textutil::wcswidth but still uses textutil::wcswidth_char
incr len [punk::char::wcswidth $uc]
}
#todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
return $len
}
#kept as a fallback for review/test if textutil::wcswidth doesn't do what we require on all terminals.
#this version looks at console testwidths if they've been cached.
#It is relatively fast - but tests unicode widths char by char - so won't be useful going forward for grapheme clusters.
proc ansifreestring_width2 {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
#}
#todo - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
#review
#if we strip out ZWJ \u0200d (zero width combiner) - then we don't give the chance for a grapheme combining test to merge properly e.g combining emojis
#as at 2024 - textutil::wcswidth doesn't seem to be aware of these - and counts the joiner as one wide
#ZWNJ \u0200c should also be zero length - but as a 'non' joiner - it should add zero to the length
#ZWSP \u0200b zero width space
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [tcl::string::length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [tcl::string::length $text]
}
#review - wcswidth should detect these
set re_ascii_fullwidth {[\uFF01-\uFF5e]}
set doublewidth_char_count 0
set zerowidth_char_count 0
#split just to get the standalone character widths - and then scan for other combiners (?)
#review
#set can_regex_high_unicode [tcl::string::match [regexp -all -inline {[\U80-\U0010FFFF]} \U0001F525] \U0001F525]
#tcl pre 2023-11 - braced high unicode regexes don't work
#fixed in bug-4ed788c618 2023-11
#set uc_sequences [regexp -all -inline -indices {[\u0100-\U10FFFF]} $text]
#set uc_sequences [regexp -all -inline -indices "\[\u0100-\U10FFFF\]" $text] ;#e.g for string: \U0001f9d1abc\U001f525ab returns {0 0} {4 4}
set uc_chars [regexp -all -inline "\[\u0100-\U10FFFF\]" $text] ;#e.g return list of chars in range only
foreach c $uc_chars {
if {[regexp $re_ascii_fullwidth $c]} {
incr doublewidth_char_count
} else {
#review
# a)- terminals lie - so we could have a bad cached testwidth
# b)- textutil::wcswidth_char seems to be east-asian-width based - and not a reliable indicator of 'character cells taken by the character when printed to the terminal' despite this statement in tclllib docs.
#(character width is a complex context-dependent topic)
# c) by checking for a cached console testwidth first - we make this function less deterministic/repeatable depending on whether console tests have been run.
# d) Upstream caching of grapheme_width may also lock in a result from whatever method was first employed here
#Despite all this - the mechanism is hoped to give best effort consistency for terminals
#further work needed for combining emojis etc - which can't be done in a per character loop
#todo - see if wcswidth does any of this. It is very slow for strings that include mixed ascii/unicode - so perhaps we can use a perlish_split
# to process sequences of unicode.
#- and the user has the option to test character sets first if terminal position reporting gives better results
if {[char_info_is_testwidth_cached $c]} {
set width [char_info_testwidth_cached $c]
} else {
#textutil::wcswidth uses unicode data
#fall back to textutil::wcswidth (which doesn't for example handle diactricts/combiners so we can't use until these and other things such as \u200b and diacritics are already stripped/accounted for)
set width [textutil::wcswidth_char [scan $c %c]]
}
if {$width == 0} {
incr zerowidth_char_count
} elseif {$width == 2} {
incr doublewidth_char_count
}
}
}
#todo - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
return [expr {[tcl::string::length $text] + $doublewidth_char_count - $zerowidth_char_count}]
}
#slow - textutil::wcswidth is slow with mixed ascii uc
proc ansifreestring_width3 {text} {
#caller responsible for calling ansistrip first if text may have ansi codes - and for ensuring no newlines
#we can c0 control characters after or while processing ansi escapes.
#we need to map remaining control characters to zero-width (under assumption we're not using a font/codepage that displays them - review!)
#anyway - we must allow things like raw ESC,DEL, NUL etc to pass through without error
#if {[tcl::string::first \033 $text] >= 0} {
# error "string_width doesn't accept ansi escape sequences. Use punk::ansi::ansistrip first"
#}
#review - work out what to do with anomalies like grave combiner which print at different lengths on different terminals (fonts?) and for which cursor-query escape sequence lies.
# - various combining diacritical marks.. from grave - to various complicated unicode joiners and composing chars etc
#as at 2023 - terminals generally seem to use the simplistic approach of tallying up individual character-widths, which means combinations that print short are reported too long by the terminal esc 6 n sequence.
#- {Combining Diacritical Marks} {ranges {{start 768 end 879 note {unicode block 0300..036F}}} description {} settype block}
#- {Combining Diacritical Marks Extended} {ranges {{start 6832 end 6911 note {unicode block 1AB0..1AFF}}} description {} settype block}
#- {Combining Diacritical Marks Supplement} {ranges {{start 7616 end 7679 note {unicode block 1DC0..1DFF}}} description {} settype block}
#- {Combining Diacritical Marks for Symbols} {ranges {{start 8400 end 8447 note {unicode block 20D0..20FF}}} description {} settype block}
#- {Combining Half Marks} {ranges {{start 65056 end 65071 note {unicode block FE20..FE2F}}} description {} settype block}
#
# initial simplistic approach is just to strip these ... todo REVIEW
#experiment to detect leading diacritics - but this isn't necessary - it's still zero-width - and if the user is splitting properly we shouldn't be getting a string with leading diacritics anyway
#(leading combiners may display in terminal as mark on rightmost prompt char which is usually a space - but won't add width even then)
#set re_leading_diacritic {^(?:[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+)}
#if {[regexp $re_leading_diacritic $text]} {
# set text " $text"
#}
set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set text [regsub -all $re_diacritics $text ""]
#we should only map control sequences to nothing after processing ones with length effects, such as \b (\x07f) or DEL \x1f
#todo - document that these shouldn't be present in input rather than explicitly checking here
set re_ascii_c0 {[\U0000-\U001F]}
set text [regsub -all $re_ascii_c0 $text ""]
#short-circuit basic cases
#support tcl pre 2023-11 - see regexp bug below
#if {![regexp {[\uFF-\U10FFFF]} $text]} {
# return [tcl::string::length $text]
#}
if {![regexp "\[\uFF-\U10FFFF\]" $text]} {
return [tcl::string::length $text]
}
#slow when ascii mixed with unicode (but why?)
return [punk::char::wcswidth $text]
}
#This shouldn't be called on text containing ansi codes!
proc strip_nonprinting_ascii {str} {
#review - some single-byte 'control' chars have visual representations e.g ETX as heart depending on font/codepage
#It is currently used for screen display width calculations
#equivalent for various unicode combining chars etc?
set map [list {*}{
\x00 ""
\x07 ""
\x7f ""
}]
return [tcl::string::map $map $str]
}
#ISO-3166-1 alpha-2 country codes to flag emojis
#These can change over time.
#todo - lookup from official sources - and update as needed
#
variable flags [dict create {*}{
AC \U1F1E6\U1F1E8
AD \U1F1E6\U1F1E9
AE \U1F1E6\U1F1EA
AF \U1F1E6\U1F1EB
AG \U1F1E6\U1F1EC
AI \U1F1E6\U1F1EE
AL \U1F1E6\U1F1F1
AM \U1F1E6\U1F1F2
AO \U1F1E6\U1F1F4
AQ \U1F1E6\U1F1F6
AR \U1F1E6\U1F1F7
AS \U1F1E6\U1F1F8
AT \U1F1E6\U1F1F9
AU \U1F1E6\U1F1FA
AW \U1F1E6\U1F1FC
AX \U1F1E6\U1F1FD
AZ \U1F1E6\U1F1FF
BA \U1F1E7\U1F1E6
BB \U1F1E7\U1F1E7
BD \U1F1E7\U1F1E9
BE \U1F1E7\U1F1EA
BF \U1F1E7\U1F1EB
BG \U1F1E7\U1F1EC
BH \U1F1E7\U1F1ED
BI \U1F1E7\U1F1EE
BJ \U1F1E7\U1F1EF
BL \U1F1E7\U1F1F1
BM \U1F1E7\U1F1F2
BN \U1F1E7\U1F1F3
BO \U1F1E7\U1F1F4
BQ \U1F1E7\U1F1F6
BR \U1F1E7\U1F1F7
BS \U1F1E7\U1F1F8
BT \U1F1E7\U1F1F9
BV \U1F1E7\U1F1FB
BW \U1F1E7\U1F1FC
BY \U1F1E7\U1F1FE
BZ \U1F1E7\U1F1FF
CA \U1F1E8\U1F1E6
CC \U1F1E8\U1F1E8
CD \U1F1E8\U1F1E9
CF \U1F1E8\U1F1Eb
CG \U1F1E8\U1F1Ec
CH \U1F1E8\U1F1ED
CI \U1F1E8\U1F1EE
CK \U1F1E8\U1F1F0
CL \U1F1E8\U1F1F1
CM \U1F1E8\U1F1F2
CN \U1F1E8\U1F1F3
CO \U1F1E8\U1F1F4
CP \U1F1E8\U1F1F5
CR \U1F1E8\U1F1F7
CS \U1F1E8\U1F1F8
CU \U1F1E8\U1F1FA
CV \U1F1E8\U1F1FB
CW \U1F1E8\U1F1FC
CX \U1F1E8\U1F1FE
CY \U1F1E8\U1F1FE
CZ \U1F1E8\U1F1FF
DE \U1F1E9\U1F1EA
DG \U1F1E9\U1F1EC
DJ \U1F1E9\U1F1EF
DK \U1F1E9\U1F1F0
DM \U1F1E9\U1F1F2
DO \U1F1E9\U1F1F4
DZ \U1F1E9\U1F1FF
EA \U1F1EA\U1F1E6
EC \U1F1EA\U1F1E8
EE \U1F1EA\U1F1EA
EG \U1F1EA\U1F1EC
EH \U1F1EA\U1F1ED
ER \U1F1EA\U1F1F7
ES \U1F1EA\U1F1F8
ET \U1F1EA\U1F1F9
EU \U1F1EA\U1F1FA
FI \U1F1EB\U1F1EE
FJ \U1F1EB\U1F1EF
FK \U1F1EB\U1F1F0
FM \U1F1EB\U1F1F2
FO \U1F1EB\U1F1F4
FR \U1F1EB\U1F1F7
GA \U1F1EC\U1F1E6
GB \U1F1EC\U1F1E7
GD \U1F1EC\U1F1E9
GE \U1F1EC\U1F1EA
GF \U1F1EC\U1F1EB
GG \U1F1EC\U1F1EC
GH \U1F1EC\U1F1ED
GI \U1F1EC\U1F1EE
GL \U1F1EC\U1F1F1
GM \U1F1EC\U1F1F2
GN \U1F1EC\U1F1F3
GP \U1F1EC\U1F1F5
GQ \U1F1EC\U1F1F6
GR \U1F1EC\U1F1F7
GS \U1F1EC\U1F1F8
GT \U1F1EC\U1F1F9
GU \U1F1EC\U1F1FA
GW \U1F1EC\U1F1FC
GY \U1F1EC\U1F1FE
HK \U1F1ED\U1F1F0
HM \U1F1ED\U1F1F2
HN \U1F1ED\U1F1F3
HR \U1F1ED\U1F1F7
HT \U1F1ED\U1F1F9
HU \U1F1ED\U1F1FA
IC \U1F1EE\U1F1E8
ID \U1F1EE\U1F1E9
IE \U1F1EE\U1F1EA
IL \U1F1EE\U1F1F1
IM \U1F1EE\U1F1F2
IN \U1F1EE\U1F1F3
IO \U1F1EE\U1F1F4
IQ \U1F1EE\U1F1F6
IR \U1F1EE\U1F1F7
IS \U1F1EE\U1F1F8
IT \U1F1EE\U1F1F9
JE \U1F1EF\U1F1EA
JM \U1F1EF\U1F1F2
JO \U1F1EF\U1F1F4
JP \U1F1EF\U1F1F5
KE \U1F1F0\U1F1EA
KG \U1F1F0\U1F1EC
KH \U1F1F0\U1F1ED
KI \U1F1F0\U1F1EE
KM \U1F1F0\U1F1F2
KN \U1F1F0\U1F1F3
KP \U1F1F0\U1F1F5
KR \U1F1F0\U1F1F7
KW \U1F1F0\U1F1FC
KY \U1F1F0\U1F1FE
KZ \U1F1F0\U1F1FF
LA \U1F1F1\U1F1E6
LB \U1F1F1\U1F1E7
LC \U1F1F1\U1F1E8
LI \U1F1F1\U1F1EE
LK \U1F1F1\U1F1F0
LR \U1F1F1\U1F1F7
LS \U1F1F1\U1F1F8
LT \U1F1F1\U1F1F9
LU \U1F1F1\U1F1FA
LV \U1F1F1\U1F1FB
LY \U1F1F1\U1F1FE
MA \U1F1F2\U1F1E6
MC \U1F1F2\U1F1E8
MD \U1F1F2\U1F1E9
ME \U1F1F2\U1F1EA
MF \U1F1F2\U1F1EB
MG \U1F1F2\U1F1EC
MH \U1F1F2\U1F1ED
MK \U1F1F2\U1F1F0
ML \U1F1F2\U1F1F1
MM \U1F1F2\U1F1F2
MN \U1F1F2\U1F1F3
MO \U1F1F2\U1F1F4
MP \U1F1F2\U1F1F5
MQ \U1F1F2\U1F1F6
MR \U1F1F2\U1F1F7
MS \U1F1F2\U1F1F8
MT \U1F1F2\U1F1F9
MU \U1F1F2\U1F1FA
MV \U1F1F2\U1F1FB
MW \U1F1F2\U1F1FC
MX \U1F1F2\U1F1FD
MY \U1F1F2\U1F1FE
MZ \U1F1F2\U1F1FF
NA \U1F1F3\U1F1E6
NC \U1F1F3\U1F1E8
NE \U1F1F3\U1F1EA
NF \U1F1F3\U1F1EB
NG \U1F1F3\U1F1EC
NI \U1F1F3\U1F1EE
NL \U1F1F3\U1F1F1
NO \U1F1F3\U1F1F4
NP \U1F1F3\U1F1F5
NR \U1F1F3\U1F1F7
NU \U1F1F3\U1F1F8
NZ \U1F1F3\U1F1ff
OM \U1F1F4\U1F1F2
PA \U1F1F5\U1F1E6
PE \U1F1F5\U1F1EA
PF \U1F1F5\U1F1EB
PG \U1F1F5\U1F1EC
PH \U1F1F5\U1F1ED
PK \U1F1F5\U1F1ED
PL \U1F1F5\U1F1F1
PM \U1F1F5\U1F1F3
PN \U1F1F5\U1F1F3
PR \U1F1F5\U1F1F7
PS \U1F1F5\U1F1F8
PT \U1F1F5\U1F1F9
PW \U1F1F5\U1F1FC
PY \U1F1F5\U1F1FE
QA \U1F1F6\U1F1E6
RE \U1F1F7\U1F1EA
RO \U1F1F7\U1F1F4
RS \U1F1F7\U1F1F7
RU \U1F1F7\U1F1FA
RW \U1F1F7\U1F1FC
SA \U1F1F8\U1F1E6
SB \U1F1F8\U1F1E7
SC \U1F1F8\U1F1E8
SD \U1F1F8\U1F1E9
SE \U1F1F8\U1F1EA
SG \U1F1F8\U1F1EC
SH \U1F1F8\U1F1ED
SI \U1F1F8\U1F1EE
SJ \U1F1F8\U1F1EF
SK \U1F1F8\U1F1F0
SL \U1F1F8\U1F1F1
SM \U1F1F8\U1F1F2
SN \U1F1F8\U1F1F3
SO \U1F1F8\U1F1F4
SR \U1F1F8\U1F1F7
SS \U1F1F8\U1F1F8
ST \U1F1F8\U1F1F9
SV \U1F1F8\U1F1F7
SX \U1F1F8\U1F1F8
SY \U1F1F8\U1F1FE
SZ \U1F1F8\U1F1FF
TA \U1F1F9\U1F1E6
TC \U1F1F9\U1F1E8
TD \U1F1F9\U1F1E9
TF \U1F1F9\U1F1EB
TG \U1F1F9\U1F1EC
TH \U1F1F9\U1F1ED
TJ \U1F1F9\U1F1EF
TK \U1F1F9\U1F1F0
TL \U1F1F9\U1F1F1
TM \U1F1F9\U1F1F2
TN \U1F1F9\U1F1F3
TO \U1F1F9\U1F1F4
TR \U1F1F9\U1F1F7
TT \U1F1F9\U1F1F9
TV \U1F1F9\U1F1FB
TW \U1F1F9\U1F1FC
TZ \U1F1F9\U1F1FF
UA \U1F1FA\U1F1E6
UG \U1F1FA\U1F1EC
UM \U1F1FA\U1F1F2
US \U1F1FA\U1F1F8
UY \U1F1FA\U1F1FE
UZ \U1F1FA\U1F1FF
VA \U1F1FB\U1F1E6
VC \U1F1FB\U1F1E8
VE \U1F1FB\U1F1EA
VG \U1F1FB\U1F1EC
VI \U1F1FB\U1F1EE
VN \U1F1FB\U1F1F3
VU \U1F1FB\U1F1FA
WF \U1F1FC\U1F1EB
WS \U1F1FC\U1F1F8
XK \U1F1FD\U1F1F3
YE \U1F1FE\U1F1EA
YT \U1F1FE\U1F1F9
ZA \U1F1FF\U1F1E6
ZM \U1F1FF\U1F1F2
ZW \U1F1FF\U1F1FC
}]
variable rflags
dict for {k v} $flags {
dict set rflags $v $k
}
proc flag_from_ascii {code} {
variable flags
if {[regexp {^[A-Z]{2}$} $code]} {
if {[dict exists $flags $code]} {
return [dict get $flags $code]
} else {
error "unsupported flags code: $code"
}
} else {
#try as subregion
#e.g gbeng,gbwls,gbsct
return \U1f3f4[tag_from_ascii $code]\Ue007f
}
}
proc flag_to_ascii {charsequence} {
variable rflags
if {[dict exists $rflags $charsequence]} {
return [dict get $rflags $charsequence]
}
if {[string index $charsequence 0] eq "\U1F3F4" && [string index $charsequence end] eq "\UE007F"} {
#subdivision flag
set tag [string range $charsequence 1 end-1]
return [tag_to_ascii $tag]
}
error "unknown flag $charsequence"
}
proc tag_to_ascii {t} {
set fmt [string repeat %c [string length $t]]
set declist [scan $t $fmt]
#unicode Tags block - e0000 to e007f
set declist [lmap dec $declist {
if {$dec < 917504 || $dec > 917631} {
error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in unicode Tags block range 917504-917631 (e0000-e007f)"
}
incr dec -917504
}]
return [format $fmt {*}$declist]
}
proc tag_from_ascii {a} {
set fmt [string repeat %c [string length $a]]
set declist [scan $a $fmt]
set declist [lmap dec $declist {
if {$dec > 127} {
error "char [ansistring VIEW -lf 1 -cr 1 -vt 1 [format %c $dec]] has decimal value $dec. Not in ascii range 0-127"
}
incr dec 917504
}]
return [format $fmt {*}$declist]
}
#split into plaintext and runs of combiners (combining diacritical marks - not ZWJ or ZWJNJ)
proc combiner_split {text} {
#split into odd numbered list (or zero) in a similar way to punk::ansi::ta::_perlish_split
#
#set re_diacritics {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+}
set graphemes [list]
if {[tcl::string::length $text] == 0} {
return {}
}
set list [list]
set start 0
set strlen [tcl::string::length $text]
#make sure our regexes aren't non-greedy - or we may not have exit condition for loop
#review
while {$start < $strlen && [regexp -start $start -indices -- {[\u0300-\u036f]+|[\u1ab0-\u1aff]+|[\u1dc0-\u1dff]+|[\u20d0-\u20ff]+|[\ufe20-\ufe2f]+} $text match]} {
lassign $match matchStart matchEnd
#puts "->start $start ->match $matchStart $matchEnd"
lappend list [tcl::string::range $text $start $matchStart-1] [tcl::string::range $text $matchStart $matchEnd]
set start [expr {$matchEnd+1}]
}
lappend list [tcl::string::range $text $start end]
}
#ZWJ ZWNJ ?
#SWSP ?
#1st shot - basic diacritics
#todo - become aware of unicode grapheme cluster boundaries
#This is difficult in Tcl without unicode property based Character Classes in the regex engine
#review - this needs to be performant - it is used a lot by punk terminal/ansi features
#todo - trie data structures for unicode?
#for now we can at least combine diacritics
#should also handle the ZWJ (and the variation selectors? eg \uFE0F) character which should account for emoji clusters
#Note - emoji cluster could be for example 11 code points/41 bytes (family emoji with skin tone modifiers for each member, 3 ZWJs)
#This still leaves a whole class of clusters.. korean etc unhandled :/
#todo - tk::startOfCluster / tk::endOfCluster - try to get it brought into Tcl
#https://core.tcl-lang.org/tcl/tktview/a4c7eeaf63
proc grapheme_split {text {return list}} {
#we should treat \r\n as a single grapheme cluster (as tk::endOfCluster does)
set components [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
#review
#lset clist end [tcl::string::cat [lindex $clist end] $combiners]
ledit clist end end [tcl::string::cat [lindex $clist end] $combiners]
lappend components {*}$clist
#lappend components {*}[lrange $clist 0 end-1]
#lappend components [tcl::string::cat [lindex $clist end] $combiners]
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend components {*}[split [lindex $csplits end] ""]
}
#we have a list of diacritically assembled components - we now need to combine any ZWJ sequences into single grapheme clusters as well as \r\n
#https://www.unicode.org/Public/17.0.0/ucd/auxiliary/GraphemeBreakTest.html
#Emoji modifiers \U1f3fb to \U1f3ff are also zero-width and combine with the preceding char
#they don't form a 'base' for combining with other characters though.
#blocks that can contain zero-width joiners and combining chars that combine with the preceding char:
# Emoticons (1F600–1F64F)
# Miscellaneous Symbols and Pictographs (1F300–1F5FF)
# Supplemental Symbols and Pictographs (1F900–1F9FF)
# Symbols and Pictographs Extended-A (1FA70–1FAFF)
# Transport and Map Symbols (1F680–1F6FF)
# Dingbats (2700–27BF)
# Miscellaneous Symbols (2600–26FF)
#RI (regional indicators) (\U1f1e6-\U1f1ff) combine in pairs to form flag emojis - but they don't combine with modifiers or ZWJs to form longer clusters.
#review \uFE0F variation selector 16 - forces emoji presentation for preceding char
#This is a basic implementation that does not check that all combinations are valid.
set graphemes [list]
set current_cluster ""
set cluster_base 0 ;#is the current cluster based on a char that can be combined with modifiers/ZWJs (e.g emoji or other cluster-based char)
# or is it based on a char that can't be combined with modifiers/ZWJs (e.g ascii letter)
set cluster_base_RI 0 ;#is the current cluster based on a regional indicator char - which can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
set current_cluster_is_extensible 0
for {set i 0} {$i < [llength $components] } {incr i} {
set component [lindex $components $i]
if {$component eq "\r" && [lindex $components $i+1] eq "\n"} {
if {$current_cluster ne ""} {
lappend graphemes $current_cluster
}
lappend graphemes "\r\n"
incr i ;#skip the \n as we've already processed it as part of the cluster
set current_cluster ""
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
set current_cluster_is_extensible 0
} elseif {$component eq "\u200d"} {
if {$current_cluster eq ""} {
#ZWJ at start of string - treat as separate grapheme cluster - but isn't a valid base for further combining with more ZWJs or modifiers
set current_cluster $component
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
set current_cluster_is_extensible 0
} else {
if {$cluster_base} {
if {$current_cluster_is_extensible} {
#a double (or longer) ZWJ sequence in a row is part of the last cluster - but not extensible anymore.
append current_cluster $component
set current_is_cluster_extensible 0
} else {
append current_cluster $component
if {$cluster_base_RI} {
#regional indicators can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
set current_cluster_is_extensible 0
#we can keep adding ZWJs or modifiers though
} else {
set current_cluster_is_extensible 1
}
}
} else {
#ZWJ after non-cluster-based char - non extensible but we continue appending ZWJs to the current cluster.
append current_cluster $component
set current_cluster_is_extensible 0
}
}
} elseif {[regexp {[\U1f3fb-\U1f3ff]} $component]} {
#emoji modifier - join with previous component
if {$current_cluster eq ""} {
#modifier at start of string - not a valid base for further combining with more modifiers or ZWJs - but we continue appending modifiers to the current cluster.
set current_cluster $component
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
} else {
if {$cluster_base} {
if {$current_cluster_is_extensible} {
append current_cluster $component
#invalidate the base!
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
} else {
append current_cluster $component
}
} else {
#modifier after non-cluster-based char - non extensible but we continue appending modifiers to the current cluster.
append current_cluster $component
}
#review
# \u1f33e\u1f3fe\u200d\u2f3fe\u200d\u1f33e is 2 clusters
#This is because after first zwj, we applied a modifier - not a valid base.
}
set current_cluster_is_extensible 0
} else {
if {$current_cluster eq ""} {
grapheme_split::start_cluster $component
} else {
#have existing cluster data
if {$current_cluster_is_extensible} {
#assert - if current_cluster_is_extensible then cluster_base should currently be true.
#if the current char is a base - we can append to existing cluster, but if it's not a base, then we start a new cluster even if we had seen a ZWJ before.
if {[regexp {[\U1f600-\U1f64f\U1f300-\U1f5ff\U1f900-\U1f9ff\U1fa70-\U1faff\U1f680-\U1f6ff\U2700-\U27bf\U2600-\u26ff]} $component]} {
append current_cluster $component
set cluster_base 1
} else {
lappend graphemes $current_cluster
set current_cluster $component
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
}
set current_cluster_is_extensible 0
} elseif {$cluster_base_RI} {
#regional indicators can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
if {[regexp {[\U1f1e6-\U1f1ff]} $component]} {
append current_cluster $component
#invalidate the base - we can't combine more than 2 RIs in a cluster, and they don't combine with modifiers or ZWJs to form longer clusters.
#we can however add more ZWJs or modifiers to the cluster - but they don't make it extensible for combining with more RIs
set cluster_base 0; set cluster_base_RI 0 ;#grapheme_split::reset_base
} else {
#something else while RI cluster is open - end the current cluster and start a new one with the current char.
lappend graphemes $current_cluster
grapheme_split::start_cluster $component
}
set current_cluster_is_extensible 0
} else {
lappend graphemes $current_cluster
grapheme_split::start_cluster $component
}
}
}
}
if {$current_cluster ne ""} {
lappend graphemes $current_cluster
}
if {$return eq "list"} {
return $graphemes
} else {
return [dict create list $graphemes last_extensible $current_cluster_is_extensible base $cluster_base RI_base $cluster_base_RI]
}
}
namespace eval grapheme_split {
proc about {} {
return "helper functions for the punk::char::grapheme_split proc - not for general use outside of that context"
}
proc reset_base {} {
upvar cluster_base cluster_base
upvar cluster_base_RI cluster_base_RI
set cluster_base 0
set cluster_base_RI 0
}
proc start_cluster {component} {
upvar current_cluster current_cluster
upvar cluster_base cluster_base
upvar cluster_base_RI cluster_base_RI
upvar current_cluster_is_extensible current_cluster_is_extensible
set current_cluster $component
# emoticons symbols/pictographs supplemental symbols/pictographs extended-a symbols/pictographs transport/map symbols dingbats misc symbols
if {[regexp {[\U1f600-\U1f64f\U1f300-\U1f5ff\U1f900-\U1f9ff\U1fa70-\U1faff\U1f680-\U1f6ff\U2700-\U27bf\U2600-\u26ff]} $component]} {
set cluster_base 1
set cluster_base_RI 0
} elseif {[regexp {[\U1f1e6-\U1f1ff]} $component]} {
#regional indicator - can only combine in pairs, and don't combine with modifiers or ZWJs to form longer clusters - so not extensible.
set cluster_base 1
set cluster_base_RI 1
} else {
#grapheme_split::reset_base
set cluster_base 0
set cluster_base_RI 0
}
#not extensible until we see a ZWJ.
set current_cluster_is_extensible 0
}
}
#https://thottingal.in/blog/2026/03/22/complex-scripts-in-terminal/#:~:text=Once%20shaping%20is%20complete%2C%20glyphs,a%20n%20c%20e%20%E2%8C%89
#Malayalam test string with 3 ligatures, 7 codepoints - and should be 3 grapheme clusters.
set str ""
# ------------------------------------------------------------------------------------------------------
#test
# ------------------------------------------------------------------------------------------------------
proc grapheme_split_tk {string} {
#note that the sequence \r\n is generally seen as a single grapheme cluster.
#if {![regexp "\[\uFF-\U10FFFF\]" $string]} {
# #only ascii (7 or 8 bit) - no joiners or unicode
# return [split $string {}]
#}
package require tk
set i 0
set graphemes [list]
while {$i < [tcl::string::length $string]} {
set aftercluster [tk::endOfCluster $string $i]
lappend graphemes [string range $string $i $aftercluster-1]
set i $aftercluster
}
return $graphemes
}
proc grapheme_split_dec {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]] ;#warning scan %c... slow for v large strings (e.g 400k+)
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
lappend graphemes {*}$pt_decs
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[scan [lindex $csplits end] [tcl::string::repeat %c [tcl::string::length [lindex $csplits end]]]]
}
return $graphemes
}
proc grapheme_split_dec2 {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} $csplits {
set pt_decs [scan $pt [tcl::string::repeat %c [tcl::string::length $pt]]]
if {$combiners ne ""} {
set combiner_decs [scan $combiners [tcl::string::repeat %c [tcl::string::length $combiners]]]
lset pt_decs end [concat [lindex $pt_decs end] $combiner_decs]
}
lappend graphemes {*}$pt_decs
}
return $graphemes
}
proc grapheme_split2 {text} {
set graphemes [list]
set csplits [combiner_split $text]
foreach {pt combiners} [lrange $csplits 0 end-1] {
set clist [split $pt ""]
lappend graphemes {*}[lrange $clist 0 end-1] [tcl::string::cat [lindex $clist end] $combiners]
}
#last csplit never has a combiner (_perlish_split style) - and may be empty - in which case we don't append it as a grapheme
if {[lindex $csplits end] ne ""} {
lappend graphemes {*}[split [lindex $csplits end] ""]
}
return $graphemes
}
# -- --- --- --- ---
#will accept a single char or a string - test using console cursor position reporting
#unreliable
proc char_info_testwidth {ch {emit 0}} {
package require punk::console
#uses cursor movement and relies on console to report position.. which it may do properly for single chars - but may misreport for combinations that combine into a single glyph
tailcall punk::console::test_char_width $ch $emit
}
proc char_info_testwidth_cached {char} {
variable charinfo
set dec [scan $char %c]
set twidth ""
if {[tcl::dict::exists $charinfo $dec testwidth]} {
set twidth [tcl::dict::get $charinfo $dec testwidth]
}
if {$twidth eq ""} {
set width [char_info_testwidth $char]
tcl::dict::set charinfo $dec testwidth $width
return $width
} else {
return $twidth
}
}
proc char_info_is_testwidth_cached {char} {
variable charinfo
return [tcl::dict::exists $charinfo [scan $char %c] testwidth]
}
# -- --- --- --- ---
}
tcl::namespace::eval punk::char::lib {
variable num_superscript
#digits and a small set of related symbols
set num_superscript [list {*}{
i \u2071
0 \u2070
1 \u00B9
2 \u00B2
3 \u00B3
4 \u2074
5 \u2075
6 \u2076
7 \u2077
8 \u2078
9 \u2079
+ \u207A
- \u207B
= \u207C
( \u207D
) \u207E
n \u207F
}]
variable num_supersub_re
set num_supersub_re {^[0-9in+\-\(\)\=]+$}
proc superscript_number {n} {
if {$n eq ""} {return ""}
variable num_superscript
variable num_supersub_re
if {![regexp $num_supersub_re $n]} {
error "Cannot convert string '$n' to superscript. valid chars: [dict keys $num_superscript]"
}
return [string map $num_superscript $n]
}
set num_subscript [list {*}{
i \u1D62
0 \u2080
1 \u2081
2 \u2082
3 \u2083
4 \u2084
5 \u2085
6 \u2086
7 \u2087
8 \u2088
9 \u2089
+ \u208A
- \u208B
= \u208C
( \u208D
) \u208E
n \u2099
}]
proc subscript_number {n} {
if {$n eq ""} {return ""}
variable num_subscript
variable num_supersub_re
if {![regexp $num_supersub_re $n]} {
error "Cannot convert string '$n' to subcript. valid chars: [dict keys $num_subscript]"
}
return [string map $num_subscript $n]
}
}
# -----------------------------------------------------------------------------
# register namespace(s) to have PUNKARGS,PUNKARGS_aliases variables checked
# -----------------------------------------------------------------------------
# variable PUNKARGS
# variable PUNKARGS_aliases
namespace eval ::punk::args::register {
#use fully qualified so 8.6 doesn't find existing var in global namespace
lappend ::punk::args::register::NAMESPACES ::punk::char
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Ready
package provide punk::char [tcl::namespace::eval punk::char {
variable version
set version 0.1.0
}]
return
#*** !doctools
#[manpage_end]