You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

266 lines
8.4 KiB

# -*- tcl -*-
# Maintenance Instruction: leave the 999999.xxx.x as is and use punkshell 'dev make' or bin/punkmake to update from <pkg>-buildversion.txt
# module template: punkshell/src/decktemplates/vendor/punk/modules/template_module-0.0.3.tm
#
# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem.
# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# (C) 2024 DasBrain
# http://paste.tclers.tk/5977
#
# @@ Meta Begin
# Application punk::cesu 999999.0a1.0
# Meta platform tcl
# Meta license <unknown>
# @@ Meta End
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# doctools header
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[manpage_begin punkshell_module_punk::cesu 0 999999.0a1.0]
#[copyright "2024"]
#[titledesc {CESU compatibility ehcoding scheme for utf-16: 8-Bit (CESU-8) ??}] [comment {-- Name section and table of contents description --}]
#[moddesc {CESU experimental}] [comment {-- Description at end of page heading --}]
#[require punk::cesu]
#[keywords module cesu encoding compatibility experimental unofficial]
#[description]
#[para] experimental
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Overview]
#[para] overview of punk::cesu
#[subsection Concepts]
#[para] cesu-8 may be mistaken for utf-8 if no supplementary chars present.
#[para] see: https://www.unicode.org/reports/tr26/tr26-4.html
#[para] Particulary note discouragement of use especially in external interchange.
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Requirements
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[subsection dependencies]
#[para] packages used by punk::cesu
#[list_begin itemized]
package require Tcl 8.6-
#*** !doctools
#[item] [package {Tcl 8.6}]
# #package require frobz
# #*** !doctools
# #[item] [package {frobz}]
#*** !doctools
#[list_end]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section API]
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Base namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
tcl::namespace::eval punk::cesu {
tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase
#variable xyz
#*** !doctools
#[subsection {Namespace punk::cesu}]
#[para] Core API functions for punk::cesu
#[list_begin definitions]
proc formatQuery args {
package require http
http::config ;#call something in http lib - as http lib uses auto_index so is lazy about creating ::http namespace etc.
set result ""
set sep ""
foreach i $args {
append result $sep [mapReply $i]
if {$sep eq "="} {
set sep &
} else {
set sep =
}
}
return $result
}
proc mapReply string {
variable ::http::formMap
set string [encoding convertto utf-8 $string]
set string [cesu2utf $string]
return [string map $formMap $string]
}
proc cesu2utf str {
#hacked by JMN - as original seemed broken and intention as to input is unclear
if {[regexp {\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])} $str]} {
#set str [string map {\ \\ \[ \\\[ \] \\\]} $str] ;#original -broken - unsure of usecase/intention
set str [string map {\\ \\\\ \[ \\\[ \] \\\]} $str] ;#guess intention is to stop premature substitution of escapes and commands
#return [subst -novariables [regsub -all {^\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])$} $str {[cesu2utfR \1 \2 \3 \4]} ]] ;#original. anchoring seems unlikely to be desirable
return [subst -novariables [regsub -all {\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])} $str {[cesu2utfR \1 \2 \3 \4]} ]]
} else {
return $str
}
}
proc cesu2utfR {1 2 3 4} {
# UTF-8: 11110xxx 10xx xxxx 10xx xxxx 10xxxxxx
# CESU-8: 11101101 1010 yy yy 10xxxx xx 11101101 1011xxxx 10xxxxxx
binary scan $1 c 1
binary scan $2 c 2
binary scan $3 c 3
puts [list $1 $2 $3]
#binary scan $4 c 4
incr 1
return [binary format ccca \
[expr {0xF0 | (($1 & 0xC) >> 2)}] \
[expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \
[expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \
$4]
}
#
proc cesu2utfC char {
# UTF-8: 11110xxx 10xx xxxx 10xx xxxx 10xxxxxx
# CESU-8: 11101101 1010 yy yy 10xxxx xx 11101101 1011xxxx 10xxxxxx
if {[regexp {^\xED([\xA0-\xAF])([\x80-\xBF])\xED([\xB0-\xBF])([\x80-\xBF])$} $char -> 1 2 3 4]} {
binary scan $1 c 1
binary scan $2 c 2
binary scan $3 c 3
puts [list $1 $2 $3]
#binary scan $4 c 4
incr 1
return [binary format ccca \
[expr {0xF0 | (($1 & 0xC) >> 2)}] \
[expr {0x80 | (($1 & 0x3) << 4) | (($2 & 0x3C) >> 2)}] \
[expr {0x80 | (($2 & 0x3) << 4) | ($3 & 0xF)}] \
$4]
} else {
puts "Invalid sequence: $char"
return $char
}
}
#data
# surrogate pair \ud83d\udcc4 'document icon' = \U1F4C4
#\U1f400 'mouse'
#\U1f600 'smiley'
proc test1 {} {
#JMN
#package require Tcl 9-
set c [encoding convertto cesu-8 \ud83d\ude10]
set x [cesu2utfC $c]
encoding convertfrom utf-8 $x
}
#e.g test2 "note \ud83f\udd1e etc"
#e.g test2 "faces \ud83d\ude10 \ud83d\ude21 \ud83d\ude31"
#note: test2 \U1f600 returns a mouse (\U1f400) instead of smiley
# but test2 \U1f400 returns a mouse.
# Either surrogated_string shouldn't include non BMP chars anyway (G.I.G.O?).. or we're doing something wrong.
proc test2 {surrogated_string} {
#JMN
set cesu [encoding convertto cesu-8 $surrogated_string]
set x [cesu2utf $cesu]
encoding convertfrom utf-8 $x
}
#
#test_enc_equivalency \U1f400 \U1f600
proc test_enc_equivalency {c1 c2} {
package require punk::ansi
namespace import ::punk::ansi::a+ ::punk::ansi::a
foreach enc [lsort [encoding names]] {
puts stdout "testing $enc"
if {$enc in "iso2022 iso2022-jp iso2022-kr"} {
puts stderr "skipping $enc - crashes tcl9 on non BMP codepoints"
continue
}
if {[catch {
set equiv [string equal [encoding convertto $enc $c1] [encoding convertto $enc $c2]]
if {$equiv} {
puts stdout "[a+ web-green]c1 = c2 under $enc[a]"
} else {
puts stdout "[a+ web-red]c1 not equal to c2 under $enc[a]"
}
} errM]} {
puts stdout "error encoding c1/c2 under $enc\n $errM"
}
}
}
#*** !doctools
#[list_end] [comment {--- end definitions namespace punk::cesu ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# Secondary API namespace
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
tcl::namespace::eval punk::cesu::lib {
tcl::namespace::export {[a-z]*} ;# Convention: export all lowercase
tcl::namespace::path [tcl::namespace::parent]
#*** !doctools
#[subsection {Namespace punk::cesu::lib}]
#[para] Secondary functions that are part of the API
#[list_begin definitions]
#proc utility1 {p1 args} {
# #*** !doctools
# #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]]
# #[para]Description of utility1
# return 1
#}
#*** !doctools
#[list_end] [comment {--- end definitions namespace punk::cesu::lib ---}]
}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
#*** !doctools
#[section Internal]
#tcl::namespace::eval punk::cesu::system {
#*** !doctools
#[subsection {Namespace punk::cesu::system}]
#[para] Internal functions that are not part of the API
#}
# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++
## Ready
package provide punk::cesu [tcl::namespace::eval punk::cesu {
variable pkg punk::cesu
variable version
set version 999999.0a1.0
}]
return
#*** !doctools
#[manpage_end]