# -*- tcl -*- # Maintenance Instruction: leave the 999999.xxx.x as is and use punkshell 'dev make' or bin/punkmake to update from -buildversion.txt # module template: punkshell/src/decktemplates/vendor/punk/modules/template_module-0.0.3.tm # # Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. # Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # (C) 2024 DasBrain # http://paste.tclers.tk/5977 # # @@ Meta Begin # Application punk::cesu 999999.0a1.0 # Meta platform tcl # Meta license # @@ Meta End # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # doctools header # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[manpage_begin punkshell_module_punk::cesu 0 999999.0a1.0] #[copyright "2024"] #[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}] #[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}] #[require punk::cesu] #[keywords module cesu encoding compatibility experimental unofficial] #[description] #[para] experimental # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section Overview] #[para] overview of punk::cesu #[subsection Concepts] #[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present. #[para] see: https://www.unicode.org/reports/tr26/tr26-4.html #[para] Particulary note discouragement of use especially in external interchange. # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[subsection dependencies] #[para] packages used by punk::cesu #[list_begin itemized] package require Tcl 8.6- #*** !doctools #[item] [package {Tcl 8.6}] # #package require frobz # #*** !doctools # #[item] [package {frobz}] #*** !doctools #[list_end] # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section API] # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # Base namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ tcl::namespace::eval punk::cesu { tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase #variable xyz #*** !doctools #[subsection {Namespace punk::cesu}] #[para] Core API functions for punk::cesu #[list_begin definitions] proc formatQuery args { package require http http::config ;#call something in http lib - as http lib uses auto_index so is lazy about creating ::http namespace etc. set result "" set sep "" foreach i $args { append result $sep [mapReply $i] if {$sep eq "="} { set sep & } else { set sep = } } return $result } proc mapReply string { variable ::http::formMap set string [encoding convertto utf-8 $string] set string [cesu2utf $string] return [string map $formMap $string] } proc cesu2utf str { #hacked by JMN - as original seemed broken and intention as to input is unclear if {[regexp {\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])} $str]} { #set str [string map {\ \\ \[ \\\[ \] \\\]} $str] ;#original -broken - unsure of usecase/intention set str [string map {\\ \\\\ \[ \\\[ \] \\\]} $str] ;#guess intention is to stop premature substitution of escapes and commands #return [subst -novariables [regsub -all {^\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])$} $str {[cesu2utfR \1 \2 \3 \4]} ]] ;#original. anchoring seems unlikely to be desirable return [subst -novariables [regsub -all {\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])} $str {[cesu2utfR \1 \2 \3 \4]} ]] } else { return $str } } proc cesu2utfR {1 2 3 4} { # UTF-8: 11110xxx 10xx xxxx 10xx xxxx 10xxxxxx # CESU-8: 11101101 1010 yy yy 10xxxx xx 11101101 1011xxxx 10xxxxxx binary scan $1 c 1 binary scan $2 c 2 binary scan $3 c 3 puts [list $1 $2 $3] #binary scan $4 c 4 incr 1 return [binary format ccca \ [expr {0xF0 | (($1 & 0xC) >> 2)}] \ [expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \ [expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \ $4] } # proc cesu2utfC char { # UTF-8: 11110xxx 10xx xxxx 10xx xxxx 10xxxxxx # CESU-8: 11101101 1010 yy yy 10xxxx xx 11101101 1011xxxx 10xxxxxx if {[regexp {^\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])$} $char -> 1 2 3 4]} { binary scan $1 c 1 binary scan $2 c 2 binary scan $3 c 3 puts [list $1 $2 $3] #binary scan $4 c 4 incr 1 return [binary format ccca \ [expr {0xF0 | (($1 & 0xC) >> 2)}] \ [expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \ [expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \ $4] } else { puts "Invalid sequence: $char" return $char } } #data # surrogate pair \ud83d\udcc4 'document icon' = \U1F4C4 #\U1f400 'mouse' #\U1f600 'smiley' proc test1 {} { #JMN #package require Tcl 9- set c [encoding convertto cesu-8 \ud83d\ude10] set x [cesu2utfC $c] encoding convertfrom utf-8 $x } #e.g test2 "note \ud83f\udd1e etc" #e.g test2 "faces \ud83d\ude10 \ud83d\ude21 \ud83d\ude31" #note: test2 \U1f600 returns a mouse (\U1f400) instead of smiley # but test2 \U1f400 returns a mouse. # Either surrogated_string shouldn't include non BMP chars anyway (G.I.G.O?).. or we're doing something wrong. proc test2 {surrogated_string} { #JMN set cesu [encoding convertto cesu-8 $surrogated_string] set x [cesu2utf $cesu] encoding convertfrom utf-8 $x } # #test_enc_equivalency \U1f400 \U1f600 proc test_enc_equivalency {c1 c2} { package require punk::ansi namespace import ::punk::ansi::a+ ::punk::ansi::a foreach enc [lsort [encoding names]] { puts stdout "testing $enc" if {$enc in "iso2022 iso2022-jp iso2022-kr"} { puts stderr "skipping $enc - crashes tcl9 on non BMP codepoints" continue } if {[catch { set equiv [string equal [encoding convertto $enc $c1] [encoding convertto $enc $c2]] if {$equiv} { puts stdout "[a+ web-green]c1 = c2 under $enc[a]" } else { puts stdout "[a+ web-red]c1 not equal to c2 under $enc[a]" } } errM]} { puts stdout "error encoding c1/c2 under $enc\n $errM" } } } #*** !doctools #[list_end] [comment {--- end definitions namespace punk::cesu ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # Secondary API namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ tcl::namespace::eval punk::cesu::lib { tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase tcl::namespace::path [tcl::namespace::parent] #*** !doctools #[subsection {Namespace punk::cesu::lib}] #[para] Secondary functions that are part of the API #[list_begin definitions] #proc utility1 {p1 args} { # #*** !doctools # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] # #[para]Description of utility1 # return 1 #} #*** !doctools #[list_end] [comment {--- end definitions namespace punk::cesu::lib ---}] } # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ #*** !doctools #[section Internal] #tcl::namespace::eval punk::cesu::system { #*** !doctools #[subsection {Namespace punk::cesu::system}] #[para] Internal functions that are not part of the API #} # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Ready package provide punk::cesu [tcl::namespace::eval punk::cesu { variable pkg punk::cesu variable version set version 999999.0a1.0 }] return #*** !doctools #[manpage_end]