diff --git a/src/bootsupport/modules/punk/mix/base-0.1.tm b/src/bootsupport/modules/punk/mix/base-0.1.tm index 57f7f5a7..ea9fc85f 100644 --- a/src/bootsupport/modules/punk/mix/base-0.1.tm +++ b/src/bootsupport/modules/punk/mix/base-0.1.tm @@ -433,7 +433,7 @@ namespace eval punk::mix::base { set default_hash sha1 ;#but fall back to md5 if either sha1 is unavailable or unaccelerated (pure tcl sha1 is way slower than pure tcl md5 - can take minutes on even moderate sized source files) if {![catch {package require sha1}]} { set impls [::sha1::Implementations] - if {[llength $impls] == 1} { + if {[llength $impls] == 1 && [string tolower [lindex $impls 0]] eq "tcl"} { set default_hash md5 } } else { diff --git a/src/bootsupport/modules/punkcheck-0.1.0.tm b/src/bootsupport/modules/punkcheck-0.1.0.tm index 8f830844..e0bb8b65 100644 --- a/src/bootsupport/modules/punkcheck-0.1.0.tm +++ b/src/bootsupport/modules/punkcheck-0.1.0.tm @@ -340,7 +340,9 @@ namespace eval punkcheck { set ts_now [clock microseconds] set metadata_us [expr {$ts_now - $ts_start}] - dict set installing_record -metadata_us $metadata_us + #?? + #JJJ + #dict set installing_record -metadata_us $metadata_us dict set installing_record -ts_start_transfer $ts_now lset fileinfo_body end $installing_record @@ -473,6 +475,7 @@ namespace eval punkcheck { set path_cksum_caches [list] } set o_fileset_record [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $rel_source_path $o_fileset_record $path_cksum_caches] + #JJJ - update -metadata_us here? } method targetset_last_complete {} { @@ -1618,13 +1621,23 @@ namespace eval punkcheck { continue } #puts stdout " checking file : $current_source_dir/$m" - set thismatchdir [file dirname $m] - if {$last_processed_dir ne $thismatchdir} { - puts stdout "\n checking files in [file join $current_source_dir $thismatchdir]" - set last_processed_dir $thismatchdir + #give some output - but not too deep + #set thismatchdir [file dirname [file dirname [file join $srcdir $relative_source_dir]]] + #if {$last_processed_dir ne $thismatchdir} { + # puts stdout "\n checking files in $thismatchdir" + # set last_processed_dir $thismatchdir + #} else { + # puts -nonewline stdout . + #} + if {$last_processed_dir ne $CALLDEPTH} { + puts -nonewline stdout \n[string repeat " " $CALLDEPTH][file tail $relative_source_dir] + puts -nonewline stdout \n[string repeat " " $CALLDEPTH]. + flush stdout + set last_processed_dir $CALLDEPTH } else { puts -nonewline stdout . } + set ts_start [clock microseconds] set seconds [expr {$ts_start / 1000000}] set ts_start_iso [clock format $seconds -format "%Y-%m-%dT%H:%M:%S"] @@ -1670,7 +1683,19 @@ namespace eval punkcheck { set ts2 [clock milliseconds] set diff [expr {$ts2 - $ts1}] if {$diff > 100} { - puts stderr "punkcheck: performance warning: fetch_metdata for $m took $diff ms" + set errprefix ">>> punkcheck:" + puts stderr "$errprefix performance warning: fetch_metadata for $m took $diff ms." 
+ set lb [lindex [dict get $filerec body] end] + #puts stderr "$errprefix filerec last body record:$lb" + set records [dict get $lb body] + set lr [lindex $records end] + set alg [dict get $lr -cksum_all_opts -cksum_algorithm] + if {$alg eq "sha1"} { + puts "$errprefix cksum_algorithm: sha1 (accelerators: [::sha1::Implementations])" + puts "$errprefix sha1 from: [package ifneeded sha1 [package present sha1]]" + } else { + puts "$errprefix cksum_algorithm: $alg" + } } @@ -1806,6 +1831,8 @@ namespace eval punkcheck { if {$max_depth != -1 && $CALLDEPTH >= $max_depth} { #don't process any more subdirs + #sometimes deliberately called with max_depth 1 - so don't warn here. review + #puts stderr "punkcheck::install warning - reached max_depth $max_depth" set subdirs [list] } else { set subdirs [glob -nocomplain -dir $current_source_dir -type d -tail *] @@ -1895,7 +1922,7 @@ namespace eval punkcheck { if {[llength $files_copied] || [llength $files_skipped]} { #puts stdout ">>>>>>>>>>>>>>>>>>>" set saveresult [punkcheck::save_records_to_file $punkcheck_records $punkcheck_file] - puts stdout "punkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" + puts stdout "\npunkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" #puts stdout ">>>>>>>>>>>>>>>>>>>" } else { #todo - write db INSTALLER record if -debug true diff --git a/src/make.tcl b/src/make.tcl index 835fee21..853185bf 100644 --- a/src/make.tcl +++ b/src/make.tcl @@ -22,7 +22,7 @@ namespace eval ::punkboot { variable pkg_requirements [list]; variable pkg_missing [list];variable pkg_loaded [list] variable non_help_flags [list -k] variable help_flags [list -help --help /? -h] - variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate] + variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate ] } @@ -242,12 +242,12 @@ if {[file tail $startdir] eq "src"} { } } # ------------------------------------------------------------------------------------- +set original_tm_list [tcl::tm::list] +set original_auto_path $::auto_path set package_paths_modified 0 if {$bootsupport_paths_exist || $sourcesupport_paths_exist} { - set original_tm_list [tcl::tm::list] tcl::tm::remove {*}$original_tm_list - set original_auto_path $::auto_path #very basic test there is something there.. 
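# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# The make.tcl hunk above hoists the capture of the original package search paths out of
# the conditional, so the restore further down can always refer to them.  The underlying
# save/strip/restore pattern, in minimal form:
set original_tm_list   [tcl::tm::list]        ;# capture before any modification
set original_auto_path $::auto_path

tcl::tm::remove {*}[tcl::tm::list]            ;# bootstrap with a reduced module path
# ... add bootsupport/sourcesupport paths and do the bootstrap work ...

if {[llength $original_tm_list]} {
    tcl::tm::add {*}$original_tm_list         ;# restore the original module paths
}
lappend ::auto_path {*}$original_auto_path    ;# re-append the original auto_path entries
# ---------------------------------------------------------------------------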
set support_contents_exist 0 @@ -1256,6 +1256,10 @@ if {$::punkboot::command eq "check"} { # - package path restore original module paths and auto_path entries to take effect in addition to bootsupport paths # - Order such that bootsupport entries are always higher priority (if same version number - prefer bootsupport) # - This must be done between the two "check" command sections + +# Ideally we would be running make.tcl purely from bootsupport packages - but binary packages such as Thread are required, +# and without accelerators the performance is abysmal (e.g minutes vs seconds for common sha1 operations) + if {$package_paths_modified} { set tm_list_boot [tcl::tm::list] tcl::tm::remove {*}$tm_list_boot @@ -1271,6 +1275,25 @@ if {$package_paths_modified} { lappend ::auto_path {*}$original_auto_path } # -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +#The problem with forcing our main packages to load only from bootsupport/sourcesupport is that if they require packages that use accelerators +# and no acceleration is available in bootsupport/sourcesupport - then they will stay unaccelerated even if the os packages provide it +# Also - some packages such as struct::set don't seem to handle reloading after forget (at least - not without destroying the command first) +#Packages that provide acceleration don't use a consistent API for testing acceleration e.g md5, sha1, struct::set in tcllib all differ in +#whether they provide functions such as Loaded, Implementations, SwitchTo +set acceleratable [list sha1 md5] +lappend acceleratable {*}[lsearch -all -inline [package names] struct::*] +foreach p $acceleratable { + package forget $p + if {[string match struct::* $p]} { + catch {rename $p ""} + } +} +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + #2nd part of "check" if {$::punkboot::command eq "check"} { set sep [string repeat - 75] @@ -2103,6 +2126,7 @@ puts "---------------------" #how can we do this for runtimes from other platforms? #method1 try to mount as zip and kit - depends on current runtime to have mkzip - just because there is zip data doesn't mean the kit can mount it #method2 analyze executable to determine if its for another platform - then ask user and save answers in a config file.? +#mthod3 qemu? 
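# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# Re the acceleratable-package refresh above (package forget sha1/md5/struct::*): once the
# packages are forgotten, the next 'package require' may resolve to an OS-provided build
# with tcllibc acceleration.  A rough way to confirm what was actually picked up - the
# same details the punkcheck performance warning reports:
if {![catch {package require sha1} sha1ver]} {
    puts "sha1 $sha1ver loaded via: [package ifneeded sha1 $sha1ver]"
    if {[llength [info commands ::sha1::Implementations]]} {
        # anything beyond 'tcl' here means an accelerator (critcl/tcllibc, Trf, ...) is usable
        puts "sha1 implementations: [::sha1::Implementations]"
    }
}
# ---------------------------------------------------------------------------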
set runtime_caps [dict create] foreach runtime [dict keys $runtime_vfs_map] { set capscript { @@ -2355,7 +2379,7 @@ dict set path_cksum_cache {*}[punk::mix::base::lib::get_relativecksum_from_base # punkcheck allows us to not rely purely on timestamps (which may be unreliable) # foreach vfstail $vfs_tails { - + set vfsname [file rootname $vfstail] puts stdout " ------------------------------------" puts stdout " checking vfs $sourcefolder/vfs/$vfstail for configured runtimes" @@ -2372,21 +2396,25 @@ foreach vfstail $vfs_tails { #set runtimes [dict get $vfs_runtime_map $vfstail] #runtimes in vfs_runtime_map_vfs dict are unsuffixed (.exe stripped or was not present) set applist [dict get $vfs_runtime_map $vfstail] - foreach rt_app $applist { - lappend runtimes [lindex $rt_app 0] - } if {"windows" eq $::tcl_platform(platform)} { - set runtimes_raw $runtimes - set runtimes [list] - foreach rt $runtimes_raw { - if {![string match *.exe $rt] && $rt ne "-"} { - set rt $rt.exe + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {$rt ne "-" && [file exists $rtfolder/$rt.exe]} { + lappend runtimes $rt.exe + } + } + } else { + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {[file exists $rtfolder/$rt]} { + lappend runtimes $rt } - lappend runtimes $rt } } } else { #only match this vfs to a correspondingly named runtime if there was no explicit entry for that runtime + #review - should this only be done if there are NO explicit entries? how does a user stop builds of unneeded exes that match without renaming files/folders? + #but conversely, adding an extra entry shouldn't stop default builds that used to run.. set matchrt [file rootname [file tail $vfstail]] ;#e.g project.vfs -> project if {![dict exists $runtime_vfs_map $matchrt]} { if {"windows" eq $::tcl_platform(platform)} { @@ -2394,7 +2422,9 @@ foreach vfstail $vfs_tails { lappend runtimes $matchrt.exe } } else { - lappend runtimes $matchrt + if {[file exists $rtfolder/$matchrt]} { + lappend runtimes $matchrt + } } } } diff --git a/src/modules/punk/mix/base-0.1.tm b/src/modules/punk/mix/base-0.1.tm index 57f7f5a7..ea9fc85f 100644 --- a/src/modules/punk/mix/base-0.1.tm +++ b/src/modules/punk/mix/base-0.1.tm @@ -433,7 +433,7 @@ namespace eval punk::mix::base { set default_hash sha1 ;#but fall back to md5 if either sha1 is unavailable or unaccelerated (pure tcl sha1 is way slower than pure tcl md5 - can take minutes on even moderate sized source files) if {![catch {package require sha1}]} { set impls [::sha1::Implementations] - if {[llength $impls] == 1} { + if {[llength $impls] == 1 && [string tolower [lindex $impls 0]] eq "tcl"} { set default_hash md5 } } else { diff --git a/src/modules/punkcheck-0.1.0.tm b/src/modules/punkcheck-0.1.0.tm index 8f830844..4f1b6bb9 100644 --- a/src/modules/punkcheck-0.1.0.tm +++ b/src/modules/punkcheck-0.1.0.tm @@ -340,7 +340,9 @@ namespace eval punkcheck { set ts_now [clock microseconds] set metadata_us [expr {$ts_now - $ts_start}] - dict set installing_record -metadata_us $metadata_us + #?? + #JJJ + #dict set installing_record -metadata_us $metadata_us dict set installing_record -ts_start_transfer $ts_now lset fileinfo_body end $installing_record @@ -473,6 +475,7 @@ namespace eval punkcheck { set path_cksum_caches [list] } set o_fileset_record [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $rel_source_path $o_fileset_record $path_cksum_caches] + #JJJ - update -metadata_us here? 
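# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# Re the #JJJ question above: if -metadata_us is to be maintained from this call site
# (rather than inside installfile_add_source_and_fetch_metadata, where the dict set is now
# commented out), the timing could be taken around the call - exact record key and
# placement still to be confirmed:
set ts_start [clock microseconds]
set o_fileset_record [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $rel_source_path $o_fileset_record $path_cksum_caches]
set metadata_us [expr {[clock microseconds] - $ts_start}]
#dict set o_fileset_record -metadata_us $metadata_us   ;# hypothetical - depends on which nested record should carry it
# ---------------------------------------------------------------------------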
} method targetset_last_complete {} { @@ -1618,13 +1621,23 @@ namespace eval punkcheck { continue } #puts stdout " checking file : $current_source_dir/$m" - set thismatchdir [file dirname $m] - if {$last_processed_dir ne $thismatchdir} { - puts stdout "\n checking files in [file join $current_source_dir $thismatchdir]" - set last_processed_dir $thismatchdir + #give some output - but not too deep + #set thismatchdir [file dirname [file dirname [file join $srcdir $relative_source_dir]]] + #if {$last_processed_dir ne $thismatchdir} { + # puts stdout "\n checking files in $thismatchdir" + # set last_processed_dir $thismatchdir + #} else { + # puts -nonewline stdout . + #} + if {$last_processed_dir ne $CALLDEPTH} { + puts -nonewline stdout \n[string repeat " " $CALLDEPTH][file tail $relative_source_dir] + puts -nonewline stdout \n[string repeat " " $CALLDEPTH]. + flush stdout + set last_processed_dir $CALLDEPTH } else { puts -nonewline stdout . } + set ts_start [clock microseconds] set seconds [expr {$ts_start / 1000000}] set ts_start_iso [clock format $seconds -format "%Y-%m-%dT%H:%M:%S"] @@ -1670,7 +1683,19 @@ namespace eval punkcheck { set ts2 [clock milliseconds] set diff [expr {$ts2 - $ts1}] if {$diff > 100} { - puts stderr "punkcheck: performance warning: fetch_metdata for $m took $diff ms" + set errprefix ">>> punkcheck:" + puts stderr "\n$errprefix performance warning: fetch_metadata for $m took $diff ms." + set lb [lindex [dict get $filerec body] end] + #puts stderr "$errprefix filerec last body record:$lb" + set records [dict get $lb body] + set lr [lindex $records end] + set alg [dict get $lr -cksum_all_opts -cksum_algorithm] + if {$alg eq "sha1"} { + puts stderr "$errprefix cksum_algorithm: sha1 (accelerators: [::sha1::Implementations])" + puts stderr "$errprefix sha1 from: [package ifneeded sha1 [package present sha1]]" + } else { + puts stderr "$errprefix cksum_algorithm: $alg" + } } @@ -1806,6 +1831,8 @@ namespace eval punkcheck { if {$max_depth != -1 && $CALLDEPTH >= $max_depth} { #don't process any more subdirs + #sometimes deliberately called with max_depth 1 - so don't warn here. 
review + #puts stderr "punkcheck::install warning - reached max_depth $max_depth" set subdirs [list] } else { set subdirs [glob -nocomplain -dir $current_source_dir -type d -tail *] @@ -1895,7 +1922,7 @@ namespace eval punkcheck { if {[llength $files_copied] || [llength $files_skipped]} { #puts stdout ">>>>>>>>>>>>>>>>>>>" set saveresult [punkcheck::save_records_to_file $punkcheck_records $punkcheck_file] - puts stdout "punkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" + puts stdout "\npunkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" #puts stdout ">>>>>>>>>>>>>>>>>>>" } else { #todo - write db INSTALLER record if -debug true diff --git a/src/project_layouts/custom/_project/punk.basic/src/make.tcl b/src/project_layouts/custom/_project/punk.basic/src/make.tcl index 835fee21..853185bf 100644 --- a/src/project_layouts/custom/_project/punk.basic/src/make.tcl +++ b/src/project_layouts/custom/_project/punk.basic/src/make.tcl @@ -22,7 +22,7 @@ namespace eval ::punkboot { variable pkg_requirements [list]; variable pkg_missing [list];variable pkg_loaded [list] variable non_help_flags [list -k] variable help_flags [list -help --help /? -h] - variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate] + variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate ] } @@ -242,12 +242,12 @@ if {[file tail $startdir] eq "src"} { } } # ------------------------------------------------------------------------------------- +set original_tm_list [tcl::tm::list] +set original_auto_path $::auto_path set package_paths_modified 0 if {$bootsupport_paths_exist || $sourcesupport_paths_exist} { - set original_tm_list [tcl::tm::list] tcl::tm::remove {*}$original_tm_list - set original_auto_path $::auto_path #very basic test there is something there.. 
set support_contents_exist 0 @@ -1256,6 +1256,10 @@ if {$::punkboot::command eq "check"} { # - package path restore original module paths and auto_path entries to take effect in addition to bootsupport paths # - Order such that bootsupport entries are always higher priority (if same version number - prefer bootsupport) # - This must be done between the two "check" command sections + +# Ideally we would be running make.tcl purely from bootsupport packages - but binary packages such as Thread are required, +# and without accelerators the performance is abysmal (e.g minutes vs seconds for common sha1 operations) + if {$package_paths_modified} { set tm_list_boot [tcl::tm::list] tcl::tm::remove {*}$tm_list_boot @@ -1271,6 +1275,25 @@ if {$package_paths_modified} { lappend ::auto_path {*}$original_auto_path } # -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +#The problem with forcing our main packages to load only from bootsupport/sourcesupport is that if they require packages that use accelerators +# and no acceleration is available in bootsupport/sourcesupport - then they will stay unaccelerated even if the os packages provide it +# Also - some packages such as struct::set don't seem to handle reloading after forget (at least - not without destroying the command first) +#Packages that provide acceleration don't use a consistent API for testing acceleration e.g md5, sha1, struct::set in tcllib all differ in +#whether they provide functions such as Loaded, Implementations, SwitchTo +set acceleratable [list sha1 md5] +lappend acceleratable {*}[lsearch -all -inline [package names] struct::*] +foreach p $acceleratable { + package forget $p + if {[string match struct::* $p]} { + catch {rename $p ""} + } +} +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + #2nd part of "check" if {$::punkboot::command eq "check"} { set sep [string repeat - 75] @@ -2103,6 +2126,7 @@ puts "---------------------" #how can we do this for runtimes from other platforms? #method1 try to mount as zip and kit - depends on current runtime to have mkzip - just because there is zip data doesn't mean the kit can mount it #method2 analyze executable to determine if its for another platform - then ask user and save answers in a config file.? +#mthod3 qemu? 
set runtime_caps [dict create] foreach runtime [dict keys $runtime_vfs_map] { set capscript { @@ -2355,7 +2379,7 @@ dict set path_cksum_cache {*}[punk::mix::base::lib::get_relativecksum_from_base # punkcheck allows us to not rely purely on timestamps (which may be unreliable) # foreach vfstail $vfs_tails { - + set vfsname [file rootname $vfstail] puts stdout " ------------------------------------" puts stdout " checking vfs $sourcefolder/vfs/$vfstail for configured runtimes" @@ -2372,21 +2396,25 @@ foreach vfstail $vfs_tails { #set runtimes [dict get $vfs_runtime_map $vfstail] #runtimes in vfs_runtime_map_vfs dict are unsuffixed (.exe stripped or was not present) set applist [dict get $vfs_runtime_map $vfstail] - foreach rt_app $applist { - lappend runtimes [lindex $rt_app 0] - } if {"windows" eq $::tcl_platform(platform)} { - set runtimes_raw $runtimes - set runtimes [list] - foreach rt $runtimes_raw { - if {![string match *.exe $rt] && $rt ne "-"} { - set rt $rt.exe + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {$rt ne "-" && [file exists $rtfolder/$rt.exe]} { + lappend runtimes $rt.exe + } + } + } else { + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {[file exists $rtfolder/$rt]} { + lappend runtimes $rt } - lappend runtimes $rt } } } else { #only match this vfs to a correspondingly named runtime if there was no explicit entry for that runtime + #review - should this only be done if there are NO explicit entries? how does a user stop builds of unneeded exes that match without renaming files/folders? + #but conversely, adding an extra entry shouldn't stop default builds that used to run.. set matchrt [file rootname [file tail $vfstail]] ;#e.g project.vfs -> project if {![dict exists $runtime_vfs_map $matchrt]} { if {"windows" eq $::tcl_platform(platform)} { @@ -2394,7 +2422,9 @@ foreach vfstail $vfs_tails { lappend runtimes $matchrt.exe } } else { - lappend runtimes $matchrt + if {[file exists $rtfolder/$matchrt]} { + lappend runtimes $matchrt + } } } } diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm index 6b04827d..3c20391f 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm @@ -6439,6 +6439,8 @@ tcl::namespace::eval punk::ansi::class { set o_gx0states [list] set o_splitindex [list] + #sha1 takes *much* longer to compute than md5 if tcllibc not available - otherwise it is generally faster + #we should fall back to md5 if no acceleration available. check for command sha1::sha1c ? set o_cksum_command [list sha1::sha1 -hex] diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm index 7710fa00..ab1ca020 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm @@ -876,7 +876,7 @@ tcl::namespace::eval punk::args { set cache_key $args #ideally we would use a fast hash algorithm to produce a short key with low collision probability. - #something like md5 would be ok (this is non cryptographic) - but md5 uses open and isn't usable by default in a safe interp. 
(sha1 often faster on modern cpus) + #something like md5 would be ok (this is non cryptographic) - but md5 uses open and isn't usable by default in a safe interp. (sha1 often faster on modern cpus but terribly slow without an accelerator) #review - check if there is a built-into-tcl way to do this quickly #for now we will just key using the whole string #performance seems ok - memory usage probably not ideal diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm index 19d5177d..a4f56010 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm @@ -847,23 +847,24 @@ tcl::namespace::eval punk::libunknown { #keys are in reverse order due to tclPkgUnknown processing order set ordered_searchpaths [lreverse [dict keys $dict_added]];# ordered as in auto_path + #JN todo - look at tm epochs dict for {pkg versiond} $refresh_dict { set versions [dict keys $versiond] - puts stderr "---->pkg:$pkg versions: $versions" + #puts stderr "---->pkg:$pkg versions: $versions" foreach searchpath $ordered_searchpaths { set addedinfo [dict get $dict_added $searchpath] set vidx -1 foreach v $versions { incr vidx if {[dict exists $addedinfo $pkg $v]} { - ledit versions $vidx $vidx + ledit versions $vidx $vidx ;incr vidx -1 ;#maintain vidx as index into current state of $versions - not original state the foreach operates across. set iscript [dict get $addedinfo $pkg $v scr] #todo - find the iscript in the '$epoch pkg epochs added paths' lists and determine os vs dev vs internal #(scanning for path directly in the ifneeded script for pkgs is potentially error prone) #for .tm ifneeded scripts - the syntax is simple enough to determine directly (and ifneeded scr not stored for those anyway) set justaddedscript [package ifneeded $pkg $v] if {$justaddedscript ne $iscript} { - puts "---->refreshing $pkg $v - reverting to already stored from path:$searchpath versions: $versions" + #puts "---->refreshing $pkg $v - reverting to already stored from path:$searchpath versions: $versions" package ifneeded $pkg $v $iscript #dict set pkgvdone $pkg $v 1 } diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm index a4bc3c70..ea9fc85f 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm @@ -430,7 +430,16 @@ namespace eval punk::mix::base { #not just used by cksum_path. 
used by caller (e.g fill_relativecksums_from_base_and_relativepathdict via cksum_filter_opts) to determine what opt names passed through variable cksum_default_opts - set cksum_default_opts [dict create -cksum_content 1 -cksum_meta auto -cksum_acls 0 -cksum_usetar auto -cksum_algorithm sha1] + set default_hash sha1 ;#but fall back to md5 if either sha1 is unavailable or unaccelerated (pure tcl sha1 is way slower than pure tcl md5 - can take minutes on even moderate sized source files) + if {![catch {package require sha1}]} { + set impls [::sha1::Implementations] + if {[llength $impls] == 1 && [string tolower [lindex $impls 0]] eq "tcl"} { + set default_hash md5 + } + } else { + set default_hash md5 + } + set cksum_default_opts [dict create -cksum_content 1 -cksum_meta auto -cksum_acls 0 -cksum_usetar auto -cksum_algorithm $default_hash] proc cksum_default_opts {} { variable cksum_default_opts return $cksum_default_opts @@ -438,9 +447,9 @@ namespace eval punk::mix::base { #crc::cksum is extremely slow in tcllib as at 2023 e.g 20x slower (no c implementation?) # - try builtin zlib crc instead? - #sha1 is performant - and this is not being used in a cryptographic or adversarial context - so performance and practical unlikelihood of accidental collisions should be the main consideration. + #sha1 is performant (when accelerator present) - and this is not being used in a cryptographic or adversarial context - so performance and practical unlikelihood of accidental collisions should be the main consideration. #adler32 is fastest for some larger files of a few MB but slower on small files (possibly due to Tcl-based file load?) - #sha1 as at 2023 seems a reasonable default + #sha1 as at 2023 seems a reasonable default - (but only if accelerator present) proc cksum_algorithms {} { variable sha3_implementation #sha2 is an alias for sha256 diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm index 50bcc2f8..e0bb8b65 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm @@ -340,7 +340,9 @@ namespace eval punkcheck { set ts_now [clock microseconds] set metadata_us [expr {$ts_now - $ts_start}] - dict set installing_record -metadata_us $metadata_us + #?? + #JJJ + #dict set installing_record -metadata_us $metadata_us dict set installing_record -ts_start_transfer $ts_now lset fileinfo_body end $installing_record @@ -473,6 +475,7 @@ namespace eval punkcheck { set path_cksum_caches [list] } set o_fileset_record [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $rel_source_path $o_fileset_record $path_cksum_caches] + #JJJ - update -metadata_us here? 
} method targetset_last_complete {} { @@ -1599,6 +1602,7 @@ namespace eval punkcheck { #puts stdout "Current target dir: $current_target_dir" + set last_processed_dir "" foreach m $match_list { set new_tgt_cksum_info [list] set relative_target_path [file join $relative_target_dir $m] @@ -1617,6 +1621,23 @@ namespace eval punkcheck { continue } #puts stdout " checking file : $current_source_dir/$m" + #give some output - but not too deep + #set thismatchdir [file dirname [file dirname [file join $srcdir $relative_source_dir]]] + #if {$last_processed_dir ne $thismatchdir} { + # puts stdout "\n checking files in $thismatchdir" + # set last_processed_dir $thismatchdir + #} else { + # puts -nonewline stdout . + #} + if {$last_processed_dir ne $CALLDEPTH} { + puts -nonewline stdout \n[string repeat " " $CALLDEPTH][file tail $relative_source_dir] + puts -nonewline stdout \n[string repeat " " $CALLDEPTH]. + flush stdout + set last_processed_dir $CALLDEPTH + } else { + puts -nonewline stdout . + } + set ts_start [clock microseconds] set seconds [expr {$ts_start / 1000000}] set ts_start_iso [clock format $seconds -format "%Y-%m-%dT%H:%M:%S"] @@ -1657,7 +1678,25 @@ namespace eval punkcheck { #Note this isn't a recordlist function - so it doesn't purely operate on the records #this hits the filesystem for the sourcepath - gets checksums/timestamps depending on config. #It doesn't save to .punkcheck (the only punkcheck::installfile_ method which doesn't) + set ts1 [clock milliseconds] set filerec [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $relative_source_path $filerec] + set ts2 [clock milliseconds] + set diff [expr {$ts2 - $ts1}] + if {$diff > 100} { + set errprefix ">>> punkcheck:" + puts stderr "$errprefix performance warning: fetch_metadata for $m took $diff ms." 
+ set lb [lindex [dict get $filerec body] end] + #puts stderr "$errprefix filerec last body record:$lb" + set records [dict get $lb body] + set lr [lindex $records end] + set alg [dict get $lr -cksum_all_opts -cksum_algorithm] + if {$alg eq "sha1"} { + puts "$errprefix cksum_algorithm: sha1 (accelerators: [::sha1::Implementations])" + puts "$errprefix sha1 from: [package ifneeded sha1 [package present sha1]]" + } else { + puts "$errprefix cksum_algorithm: $alg" + } + } @@ -1681,6 +1720,7 @@ namespace eval punkcheck { lappend files_copied $current_source_dir/$m } else { if {![file exists $current_target_dir/$m]} { + puts stderr "punkcheck: first copy to $current_target_dir/$m " file mkdir $current_target_dir file copy $current_source_dir/$m $current_target_dir set new_tgt_cksum_info [punk::mix::base::lib::cksum_path $current_target_dir/$m] @@ -1691,9 +1731,15 @@ namespace eval punkcheck { installedsourcechanged-targets { if {[llength $changed]} { #An unrecorded installation is considered a source change (from unknown/unrecorded source to recorded) + puts -nonewline stderr "punkcheck: about to: file copy -force $current_source_dir/$m $current_target_dir" + set ts1 [clock milliseconds] file mkdir $current_target_dir file copy -force $current_source_dir/$m $current_target_dir + set ts2 [clock milliseconds] + puts -nonewline stderr " (copy time [expr {$ts2 - $ts1}] ms)" set new_tgt_cksum_info [punk::mix::base::lib::cksum_path $current_target_dir/$m] + set ts3 [clock milliseconds] + puts stderr " (cksum time [expr {$ts2 - $ts1}] ms)" lappend files_copied $current_source_dir/$m } else { set is_skip 1 @@ -1785,6 +1831,8 @@ namespace eval punkcheck { if {$max_depth != -1 && $CALLDEPTH >= $max_depth} { #don't process any more subdirs + #sometimes deliberately called with max_depth 1 - so don't warn here. review + #puts stderr "punkcheck::install warning - reached max_depth $max_depth" set subdirs [list] } else { set subdirs [glob -nocomplain -dir $current_source_dir -type d -tail *] @@ -1874,7 +1922,7 @@ namespace eval punkcheck { if {[llength $files_copied] || [llength $files_skipped]} { #puts stdout ">>>>>>>>>>>>>>>>>>>" set saveresult [punkcheck::save_records_to_file $punkcheck_records $punkcheck_file] - puts stdout "punkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" + puts stdout "\npunkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" #puts stdout ">>>>>>>>>>>>>>>>>>>" } else { #todo - write db INSTALLER record if -debug true diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm index d9858980..c89b3594 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm @@ -95,6 +95,7 @@ tcl::namespace::eval textblock { #NOTE sha1, although computationally more intensive, tends to be faster than md5 on modern cpus #(more likely to be optimised for modern cpu features?) #(This speed improvement may not apply for short strings) + #This is probably only true if tcllibc is available - pure-tcl sha1 is excruciatingly slow... 
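# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# A quick check of the claim above on a given install: with tcllibc present sha1 generally
# wins, with the pure-Tcl implementations md5 is far faster.
package require sha1
package require md5
set data [string repeat "punk" 250000]   ;# ~1MB of sample data
puts "sha1: [time {sha1::sha1 -hex $data} 3]"
puts "md5 : [time {md5::md5 -hex $data} 3]"
puts "sha1 implementations: [::sha1::Implementations]"
# ---------------------------------------------------------------------------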
variable use_hash ;#framecache set use_hash none ;#slightly faster but uglier layout for viewing frame_cache display diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl b/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl index 835fee21..853185bf 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl @@ -22,7 +22,7 @@ namespace eval ::punkboot { variable pkg_requirements [list]; variable pkg_missing [list];variable pkg_loaded [list] variable non_help_flags [list -k] variable help_flags [list -help --help /? -h] - variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate] + variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate ] } @@ -242,12 +242,12 @@ if {[file tail $startdir] eq "src"} { } } # ------------------------------------------------------------------------------------- +set original_tm_list [tcl::tm::list] +set original_auto_path $::auto_path set package_paths_modified 0 if {$bootsupport_paths_exist || $sourcesupport_paths_exist} { - set original_tm_list [tcl::tm::list] tcl::tm::remove {*}$original_tm_list - set original_auto_path $::auto_path #very basic test there is something there.. set support_contents_exist 0 @@ -1256,6 +1256,10 @@ if {$::punkboot::command eq "check"} { # - package path restore original module paths and auto_path entries to take effect in addition to bootsupport paths # - Order such that bootsupport entries are always higher priority (if same version number - prefer bootsupport) # - This must be done between the two "check" command sections + +# Ideally we would be running make.tcl purely from bootsupport packages - but binary packages such as Thread are required, +# and without accelerators the performance is abysmal (e.g minutes vs seconds for common sha1 operations) + if {$package_paths_modified} { set tm_list_boot [tcl::tm::list] tcl::tm::remove {*}$tm_list_boot @@ -1271,6 +1275,25 @@ if {$package_paths_modified} { lappend ::auto_path {*}$original_auto_path } # -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +#The problem with forcing our main packages to load only from bootsupport/sourcesupport is that if they require packages that use accelerators +# and no acceleration is available in bootsupport/sourcesupport - then they will stay unaccelerated even if the os packages provide it +# Also - some packages such as struct::set don't seem to handle reloading after forget (at least - not without destroying the command first) +#Packages that provide acceleration don't use a consistent API for testing acceleration e.g md5, sha1, struct::set in tcllib all differ in +#whether they provide functions such as Loaded, Implementations, SwitchTo +set acceleratable [list sha1 md5] +lappend acceleratable {*}[lsearch -all -inline [package names] struct::*] +foreach p $acceleratable { + package forget $p + if {[string match struct::* $p]} { + catch {rename $p ""} + } +} +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + #2nd part of "check" if {$::punkboot::command eq "check"} { set sep [string repeat - 75] @@ -2103,6 +2126,7 @@ puts "---------------------" #how can we do this for runtimes from other platforms? 
#method1 try to mount as zip and kit - depends on current runtime to have mkzip - just because there is zip data doesn't mean the kit can mount it #method2 analyze executable to determine if its for another platform - then ask user and save answers in a config file.? +#mthod3 qemu? set runtime_caps [dict create] foreach runtime [dict keys $runtime_vfs_map] { set capscript { @@ -2355,7 +2379,7 @@ dict set path_cksum_cache {*}[punk::mix::base::lib::get_relativecksum_from_base # punkcheck allows us to not rely purely on timestamps (which may be unreliable) # foreach vfstail $vfs_tails { - + set vfsname [file rootname $vfstail] puts stdout " ------------------------------------" puts stdout " checking vfs $sourcefolder/vfs/$vfstail for configured runtimes" @@ -2372,21 +2396,25 @@ foreach vfstail $vfs_tails { #set runtimes [dict get $vfs_runtime_map $vfstail] #runtimes in vfs_runtime_map_vfs dict are unsuffixed (.exe stripped or was not present) set applist [dict get $vfs_runtime_map $vfstail] - foreach rt_app $applist { - lappend runtimes [lindex $rt_app 0] - } if {"windows" eq $::tcl_platform(platform)} { - set runtimes_raw $runtimes - set runtimes [list] - foreach rt $runtimes_raw { - if {![string match *.exe $rt] && $rt ne "-"} { - set rt $rt.exe + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {$rt ne "-" && [file exists $rtfolder/$rt.exe]} { + lappend runtimes $rt.exe + } + } + } else { + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {[file exists $rtfolder/$rt]} { + lappend runtimes $rt } - lappend runtimes $rt } } } else { #only match this vfs to a correspondingly named runtime if there was no explicit entry for that runtime + #review - should this only be done if there are NO explicit entries? how does a user stop builds of unneeded exes that match without renaming files/folders? + #but conversely, adding an extra entry shouldn't stop default builds that used to run.. set matchrt [file rootname [file tail $vfstail]] ;#e.g project.vfs -> project if {![dict exists $runtime_vfs_map $matchrt]} { if {"windows" eq $::tcl_platform(platform)} { @@ -2394,7 +2422,9 @@ foreach vfstail $vfs_tails { lappend runtimes $matchrt.exe } } else { - lappend runtimes $matchrt + if {[file exists $rtfolder/$matchrt]} { + lappend runtimes $matchrt + } } } } diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm index 6b04827d..3c20391f 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm @@ -6439,6 +6439,8 @@ tcl::namespace::eval punk::ansi::class { set o_gx0states [list] set o_splitindex [list] + #sha1 takes *much* longer to compute than md5 if tcllibc not available - otherwise it is generally faster + #we should fall back to md5 if no acceleration available. check for command sha1::sha1c ? 
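# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# One possible answer to the question in the comment above, using a hypothetical helper:
# treat sha1 as usable for o_cksum_command only when an accelerator is present
# (sha1::sha1c is the critcl/tcllibc command the comment suggests probing;
# ::sha1::Implementations is the more general check), otherwise fall back to md5:
proc pick_cksum_command {} {
    if {![catch {package require sha1}]} {
        set impls [string tolower [::sha1::Implementations]]
        if {[llength [info commands ::sha1::sha1c]] || $impls ne "tcl"} {
            return [list sha1::sha1 -hex]
        }
    }
    package require md5
    return [list md5::md5 -hex]
}
#set o_cksum_command [pick_cksum_command]
# ---------------------------------------------------------------------------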
set o_cksum_command [list sha1::sha1 -hex] diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm index 7710fa00..ab1ca020 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm @@ -876,7 +876,7 @@ tcl::namespace::eval punk::args { set cache_key $args #ideally we would use a fast hash algorithm to produce a short key with low collision probability. - #something like md5 would be ok (this is non cryptographic) - but md5 uses open and isn't usable by default in a safe interp. (sha1 often faster on modern cpus) + #something like md5 would be ok (this is non cryptographic) - but md5 uses open and isn't usable by default in a safe interp. (sha1 often faster on modern cpus but terribly slow without an accelerator) #review - check if there is a built-into-tcl way to do this quickly #for now we will just key using the whole string #performance seems ok - memory usage probably not ideal diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm index 19d5177d..a4f56010 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/libunknown-0.1.tm @@ -847,23 +847,24 @@ tcl::namespace::eval punk::libunknown { #keys are in reverse order due to tclPkgUnknown processing order set ordered_searchpaths [lreverse [dict keys $dict_added]];# ordered as in auto_path + #JN todo - look at tm epochs dict for {pkg versiond} $refresh_dict { set versions [dict keys $versiond] - puts stderr "---->pkg:$pkg versions: $versions" + #puts stderr "---->pkg:$pkg versions: $versions" foreach searchpath $ordered_searchpaths { set addedinfo [dict get $dict_added $searchpath] set vidx -1 foreach v $versions { incr vidx if {[dict exists $addedinfo $pkg $v]} { - ledit versions $vidx $vidx + ledit versions $vidx $vidx ;incr vidx -1 ;#maintain vidx as index into current state of $versions - not original state the foreach operates across. 
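# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# The ledit/incr pairing above is the general pattern for removing elements from a list
# while iterating over a snapshot of it: foreach walks the values the list had when the
# loop started, so the index must track the shrinking working copy.  (ledit is Tcl 8.7/9;
# lreplace shown here for older interps.)
set versions {0.1 0.2 0.3 0.4}
set vidx -1
foreach v $versions {
    incr vidx
    if {$v in {0.2 0.3}} {
        set versions [lreplace $versions $vidx $vidx]   ;# or: ledit versions $vidx $vidx
        incr vidx -1                                    ;# compensate for the removed element
    }
}
puts $versions   ;# -> 0.1 0.4
# ---------------------------------------------------------------------------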
set iscript [dict get $addedinfo $pkg $v scr] #todo - find the iscript in the '$epoch pkg epochs added paths' lists and determine os vs dev vs internal #(scanning for path directly in the ifneeded script for pkgs is potentially error prone) #for .tm ifneeded scripts - the syntax is simple enough to determine directly (and ifneeded scr not stored for those anyway) set justaddedscript [package ifneeded $pkg $v] if {$justaddedscript ne $iscript} { - puts "---->refreshing $pkg $v - reverting to already stored from path:$searchpath versions: $versions" + #puts "---->refreshing $pkg $v - reverting to already stored from path:$searchpath versions: $versions" package ifneeded $pkg $v $iscript #dict set pkgvdone $pkg $v 1 } diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm index a4bc3c70..ea9fc85f 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/mix/base-0.1.tm @@ -430,7 +430,16 @@ namespace eval punk::mix::base { #not just used by cksum_path. used by caller (e.g fill_relativecksums_from_base_and_relativepathdict via cksum_filter_opts) to determine what opt names passed through variable cksum_default_opts - set cksum_default_opts [dict create -cksum_content 1 -cksum_meta auto -cksum_acls 0 -cksum_usetar auto -cksum_algorithm sha1] + set default_hash sha1 ;#but fall back to md5 if either sha1 is unavailable or unaccelerated (pure tcl sha1 is way slower than pure tcl md5 - can take minutes on even moderate sized source files) + if {![catch {package require sha1}]} { + set impls [::sha1::Implementations] + if {[llength $impls] == 1 && [string tolower [lindex $impls 0]] eq "tcl"} { + set default_hash md5 + } + } else { + set default_hash md5 + } + set cksum_default_opts [dict create -cksum_content 1 -cksum_meta auto -cksum_acls 0 -cksum_usetar auto -cksum_algorithm $default_hash] proc cksum_default_opts {} { variable cksum_default_opts return $cksum_default_opts @@ -438,9 +447,9 @@ namespace eval punk::mix::base { #crc::cksum is extremely slow in tcllib as at 2023 e.g 20x slower (no c implementation?) # - try builtin zlib crc instead? - #sha1 is performant - and this is not being used in a cryptographic or adversarial context - so performance and practical unlikelihood of accidental collisions should be the main consideration. + #sha1 is performant (when accelerator present) - and this is not being used in a cryptographic or adversarial context - so performance and practical unlikelihood of accidental collisions should be the main consideration. #adler32 is fastest for some larger files of a few MB but slower on small files (possibly due to Tcl-based file load?) 
- #sha1 as at 2023 seems a reasonable default + #sha1 as at 2023 seems a reasonable default - (but only if accelerator present) proc cksum_algorithms {} { variable sha3_implementation #sha2 is an alias for sha256 diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm index 50bcc2f8..e0bb8b65 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punkcheck-0.1.0.tm @@ -340,7 +340,9 @@ namespace eval punkcheck { set ts_now [clock microseconds] set metadata_us [expr {$ts_now - $ts_start}] - dict set installing_record -metadata_us $metadata_us + #?? + #JJJ + #dict set installing_record -metadata_us $metadata_us dict set installing_record -ts_start_transfer $ts_now lset fileinfo_body end $installing_record @@ -473,6 +475,7 @@ namespace eval punkcheck { set path_cksum_caches [list] } set o_fileset_record [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $rel_source_path $o_fileset_record $path_cksum_caches] + #JJJ - update -metadata_us here? } method targetset_last_complete {} { @@ -1599,6 +1602,7 @@ namespace eval punkcheck { #puts stdout "Current target dir: $current_target_dir" + set last_processed_dir "" foreach m $match_list { set new_tgt_cksum_info [list] set relative_target_path [file join $relative_target_dir $m] @@ -1617,6 +1621,23 @@ namespace eval punkcheck { continue } #puts stdout " checking file : $current_source_dir/$m" + #give some output - but not too deep + #set thismatchdir [file dirname [file dirname [file join $srcdir $relative_source_dir]]] + #if {$last_processed_dir ne $thismatchdir} { + # puts stdout "\n checking files in $thismatchdir" + # set last_processed_dir $thismatchdir + #} else { + # puts -nonewline stdout . + #} + if {$last_processed_dir ne $CALLDEPTH} { + puts -nonewline stdout \n[string repeat " " $CALLDEPTH][file tail $relative_source_dir] + puts -nonewline stdout \n[string repeat " " $CALLDEPTH]. + flush stdout + set last_processed_dir $CALLDEPTH + } else { + puts -nonewline stdout . + } + set ts_start [clock microseconds] set seconds [expr {$ts_start / 1000000}] set ts_start_iso [clock format $seconds -format "%Y-%m-%dT%H:%M:%S"] @@ -1657,7 +1678,25 @@ namespace eval punkcheck { #Note this isn't a recordlist function - so it doesn't purely operate on the records #this hits the filesystem for the sourcepath - gets checksums/timestamps depending on config. #It doesn't save to .punkcheck (the only punkcheck::installfile_ method which doesn't) + set ts1 [clock milliseconds] set filerec [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $relative_source_path $filerec] + set ts2 [clock milliseconds] + set diff [expr {$ts2 - $ts1}] + if {$diff > 100} { + set errprefix ">>> punkcheck:" + puts stderr "$errprefix performance warning: fetch_metadata for $m took $diff ms." 
+ set lb [lindex [dict get $filerec body] end] + #puts stderr "$errprefix filerec last body record:$lb" + set records [dict get $lb body] + set lr [lindex $records end] + set alg [dict get $lr -cksum_all_opts -cksum_algorithm] + if {$alg eq "sha1"} { + puts "$errprefix cksum_algorithm: sha1 (accelerators: [::sha1::Implementations])" + puts "$errprefix sha1 from: [package ifneeded sha1 [package present sha1]]" + } else { + puts "$errprefix cksum_algorithm: $alg" + } + } @@ -1681,6 +1720,7 @@ namespace eval punkcheck { lappend files_copied $current_source_dir/$m } else { if {![file exists $current_target_dir/$m]} { + puts stderr "punkcheck: first copy to $current_target_dir/$m " file mkdir $current_target_dir file copy $current_source_dir/$m $current_target_dir set new_tgt_cksum_info [punk::mix::base::lib::cksum_path $current_target_dir/$m] @@ -1691,9 +1731,15 @@ namespace eval punkcheck { installedsourcechanged-targets { if {[llength $changed]} { #An unrecorded installation is considered a source change (from unknown/unrecorded source to recorded) + puts -nonewline stderr "punkcheck: about to: file copy -force $current_source_dir/$m $current_target_dir" + set ts1 [clock milliseconds] file mkdir $current_target_dir file copy -force $current_source_dir/$m $current_target_dir + set ts2 [clock milliseconds] + puts -nonewline stderr " (copy time [expr {$ts2 - $ts1}] ms)" set new_tgt_cksum_info [punk::mix::base::lib::cksum_path $current_target_dir/$m] + set ts3 [clock milliseconds] + puts stderr " (cksum time [expr {$ts2 - $ts1}] ms)" lappend files_copied $current_source_dir/$m } else { set is_skip 1 @@ -1785,6 +1831,8 @@ namespace eval punkcheck { if {$max_depth != -1 && $CALLDEPTH >= $max_depth} { #don't process any more subdirs + #sometimes deliberately called with max_depth 1 - so don't warn here. review + #puts stderr "punkcheck::install warning - reached max_depth $max_depth" set subdirs [list] } else { set subdirs [glob -nocomplain -dir $current_source_dir -type d -tail *] @@ -1874,7 +1922,7 @@ namespace eval punkcheck { if {[llength $files_copied] || [llength $files_skipped]} { #puts stdout ">>>>>>>>>>>>>>>>>>>" set saveresult [punkcheck::save_records_to_file $punkcheck_records $punkcheck_file] - puts stdout "punkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" + puts stdout "\npunkcheck::install [dict get $saveresult recordcount] records saved as [dict get $saveresult linecount] lines to $punkcheck_file copied: [llength $files_copied] skipped: [llength $files_skipped]" #puts stdout ">>>>>>>>>>>>>>>>>>>" } else { #todo - write db INSTALLER record if -debug true diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm index d9858980..c89b3594 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm @@ -95,6 +95,7 @@ tcl::namespace::eval textblock { #NOTE sha1, although computationally more intensive, tends to be faster than md5 on modern cpus #(more likely to be optimised for modern cpu features?) #(This speed improvement may not apply for short strings) + #This is probably only true if tcllibc is available - pure-tcl sha1 is excruciatingly slow... 
variable use_hash ;#framecache set use_hash none ;#slightly faster but uglier layout for viewing frame_cache display diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl b/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl index 835fee21..853185bf 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl @@ -22,7 +22,7 @@ namespace eval ::punkboot { variable pkg_requirements [list]; variable pkg_missing [list];variable pkg_loaded [list] variable non_help_flags [list -k] variable help_flags [list -help --help /? -h] - variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate] + variable known_commands [list project modules vfs info check shell vendorupdate bootsupport vfscommonupdate ] } @@ -242,12 +242,12 @@ if {[file tail $startdir] eq "src"} { } } # ------------------------------------------------------------------------------------- +set original_tm_list [tcl::tm::list] +set original_auto_path $::auto_path set package_paths_modified 0 if {$bootsupport_paths_exist || $sourcesupport_paths_exist} { - set original_tm_list [tcl::tm::list] tcl::tm::remove {*}$original_tm_list - set original_auto_path $::auto_path #very basic test there is something there.. set support_contents_exist 0 @@ -1256,6 +1256,10 @@ if {$::punkboot::command eq "check"} { # - package path restore original module paths and auto_path entries to take effect in addition to bootsupport paths # - Order such that bootsupport entries are always higher priority (if same version number - prefer bootsupport) # - This must be done between the two "check" command sections + +# Ideally we would be running make.tcl purely from bootsupport packages - but binary packages such as Thread are required, +# and without accelerators the performance is abysmal (e.g minutes vs seconds for common sha1 operations) + if {$package_paths_modified} { set tm_list_boot [tcl::tm::list] tcl::tm::remove {*}$tm_list_boot @@ -1271,6 +1275,25 @@ if {$package_paths_modified} { lappend ::auto_path {*}$original_auto_path } # -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- +#The problem with forcing our main packages to load only from bootsupport/sourcesupport is that if they require packages that use accelerators +# and no acceleration is available in bootsupport/sourcesupport - then they will stay unaccelerated even if the os packages provide it +# Also - some packages such as struct::set don't seem to handle reloading after forget (at least - not without destroying the command first) +#Packages that provide acceleration don't use a consistent API for testing acceleration e.g md5, sha1, struct::set in tcllib all differ in +#whether they provide functions such as Loaded, Implementations, SwitchTo +set acceleratable [list sha1 md5] +lappend acceleratable {*}[lsearch -all -inline [package names] struct::*] +foreach p $acceleratable { + package forget $p + if {[string match struct::* $p]} { + catch {rename $p ""} + } +} +# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- + + #2nd part of "check" if {$::punkboot::command eq "check"} { set sep [string repeat - 75] @@ -2103,6 +2126,7 @@ puts "---------------------" #how can we do this for runtimes from other platforms? 
#method1 try to mount as zip and kit - depends on current runtime to have mkzip - just because there is zip data doesn't mean the kit can mount it #method2 analyze executable to determine if its for another platform - then ask user and save answers in a config file.? +#mthod3 qemu? set runtime_caps [dict create] foreach runtime [dict keys $runtime_vfs_map] { set capscript { @@ -2355,7 +2379,7 @@ dict set path_cksum_cache {*}[punk::mix::base::lib::get_relativecksum_from_base # punkcheck allows us to not rely purely on timestamps (which may be unreliable) # foreach vfstail $vfs_tails { - + set vfsname [file rootname $vfstail] puts stdout " ------------------------------------" puts stdout " checking vfs $sourcefolder/vfs/$vfstail for configured runtimes" @@ -2372,21 +2396,25 @@ foreach vfstail $vfs_tails { #set runtimes [dict get $vfs_runtime_map $vfstail] #runtimes in vfs_runtime_map_vfs dict are unsuffixed (.exe stripped or was not present) set applist [dict get $vfs_runtime_map $vfstail] - foreach rt_app $applist { - lappend runtimes [lindex $rt_app 0] - } if {"windows" eq $::tcl_platform(platform)} { - set runtimes_raw $runtimes - set runtimes [list] - foreach rt $runtimes_raw { - if {![string match *.exe $rt] && $rt ne "-"} { - set rt $rt.exe + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {$rt ne "-" && [file exists $rtfolder/$rt.exe]} { + lappend runtimes $rt.exe + } + } + } else { + foreach rt_app $applist { + set rt [lindex $rt_app 0] + if {[file exists $rtfolder/$rt]} { + lappend runtimes $rt } - lappend runtimes $rt } } } else { #only match this vfs to a correspondingly named runtime if there was no explicit entry for that runtime + #review - should this only be done if there are NO explicit entries? how does a user stop builds of unneeded exes that match without renaming files/folders? + #but conversely, adding an extra entry shouldn't stop default builds that used to run.. 
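# ---------------------------------------------------------------------------
# editor's note - illustrative sketch only, not part of the patch
# Net effect of the runtime-mapping changes above: a runtime (whether explicitly mapped or
# default name-matched) is only queued for a build when the corresponding file actually
# exists under $rtfolder, with .exe appended on Windows.  Roughly:
foreach rt_app $applist {
    set rt [lindex $rt_app 0]
    if {$rt eq "-"} {continue}
    if {"windows" eq $::tcl_platform(platform)} {
        set rt $rt.exe
    }
    if {[file exists [file join $rtfolder $rt]]} {
        lappend runtimes $rt
    }
}
# ---------------------------------------------------------------------------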
set matchrt [file rootname [file tail $vfstail]] ;#e.g project.vfs -> project if {![dict exists $runtime_vfs_map $matchrt]} { if {"windows" eq $::tcl_platform(platform)} { @@ -2394,7 +2422,9 @@ foreach vfstail $vfs_tails { lappend runtimes $matchrt.exe } } else { - lappend runtimes $matchrt + if {[file exists $rtfolder/$matchrt]} { + lappend runtimes $matchrt + } } } } diff --git a/src/vendormodules/packageTest-0.1.0.tm b/src/vendormodules/packageTest-0.1.0.tm deleted file mode 100644 index befc864a..00000000 Binary files a/src/vendormodules/packageTest-0.1.0.tm and /dev/null differ diff --git a/src/vendormodules/packageTest-0.1.1.tm b/src/vendormodules/packageTest-0.1.1.tm deleted file mode 100644 index b84f3fc2..00000000 Binary files a/src/vendormodules/packageTest-0.1.1.tm and /dev/null differ diff --git a/src/vendormodules/packageTest-0.1.2.tm b/src/vendormodules/packageTest-0.1.2.tm deleted file mode 100644 index e69038af..00000000 Binary files a/src/vendormodules/packageTest-0.1.2.tm and /dev/null differ diff --git a/src/vendormodules/packageTest-0.1.3.tm b/src/vendormodules/packageTest-0.1.3.tm deleted file mode 100644 index 676e4048..00000000 Binary files a/src/vendormodules/packageTest-0.1.3.tm and /dev/null differ diff --git a/src/vendormodules/tomlish-1.1.2.tm b/src/vendormodules/tomlish-1.1.2.tm deleted file mode 100644 index c7da645b..00000000 --- a/src/vendormodules/tomlish-1.1.2.tm +++ /dev/null @@ -1,5680 +0,0 @@ -# -*- tcl -*- -# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt -# -# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. -# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# (C) 2024 -# -# @@ Meta Begin -# Application tomlish 1.1.2 -# Meta platform tcl -# Meta license -# @@ Meta End - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# doctools header -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -#*** !doctools -#[manpage_begin tomlish_module_tomlish 0 1.1.2] -#[copyright "2024"] -#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] -#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] -#[require tomlish] -#[keywords module parsing toml configuration] -#[description] -#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) -#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml -#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, -#[para] although these other formats are generally unlikely to retain whitespace or comments -#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. -#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions -#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key -#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) -#[para] will need a -type option (-force ?) to force overriding with another type such as an int. 
- -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section Overview] -#[para] overview of tomlish -#[subsection Concepts] -#[para] - - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -## Requirements -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[subsection dependencies] -#[para] packages used by tomlish -#[list_begin itemized] - -package require Tcl 8.6- -package require struct::stack -package require logger - -#*** !doctools -#[item] [package {Tcl 8.6-}] -#[item] [package {struct::stack}] - -#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') - - -#*** !doctools -#[list_end] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section API] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Base namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish { - namespace export {[a-z]*}; # Convention: export all lowercase - variable types - - #IDEAS: - # since get_toml produces tomlish with whitespace/comments intact: - # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace - # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? - # - separate addKey?? - # - deleteKey (delete leaf) - # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) - # - set/add Table? - position in doc based on existing tables/subtables? - - #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - - # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. - #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n - #The newline is part of the keyval structure so makes reordering easier - #example from_toml "a=1\nb=2\n\n\n" - # 0 = TOMLISH - # 1 = KEY a = {INT 1} {NEWLINE lf} - # 2 = NEWLINE lf - # 3 = KEY b = {INT 2} {NEWLINE lf} - # 4 = NEWLINE lf - # 5 = NEWLINE lf - - - #ARRAY is analogous to a Tcl list - #TABLE is analogous to a Tcl dict - #WS = inline whitespace - #KEY = bare key and value - #QKEY = double quoted key and value ;#todo - rename to DQKEY? - #SQKEY = single quoted key and value - #ITABLE = inline table (*can* be anonymous table) - # inline table values immediately create a table with the opening brace - # inline tables are fully defined between their braces, as are dotted-key subtables defined within - # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained - - set tags [list TOMLISH ARRAY TABLE ITABLE ANONTABLE WS NEWLINE COMMENT DOTTEDKEY KEY QKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL DATETIME] - #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) - #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) 
- set min_int -9223372036854775808 ;#-2^63 - set max_int +9223372036854775807 ;#2^63-1 - - proc Dolog {lvl txt} { - #return "$lvl -- $txt" - set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" - puts stderr $msg - } - logger::initNamespace ::tomlish - foreach lvl [logger::levels] { - interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl - log::logproc $lvl tomlish_log_$lvl - } - - #*** !doctools - #[subsection {Namespace tomlish}] - #[para] Core API functions for tomlish - #[list_begin definitions] - - proc tags {} { - return $::tomlish::tags - } - - #helper function for to_dict - proc _get_keyval_value {keyval_element} { - log::notice ">>> _get_keyval_value from '$keyval_element'<<<" - set found_value 0 - #find the value - # 3 is the earliest index at which the value could occur (depending on whitespace) - set found_sub [list] - if {[lindex $keyval_element 2] ne "="} { - error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list" - } - foreach sub [lrange $keyval_element 2 end] { - #note that a barekey/quotedkey won't occur directly inside a barekey/quotedkey - switch -exact -- [lindex $sub 0] { - STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { - set type [lindex $sub 0] - set value [lindex $sub 1] - set found_sub $sub - incr found_value 1 - } - default {} - } - } - if {!$found_value} { - error "tomlish Failed to find value element in KEY. '$keyval_element'" - } - if {$found_value > 1} { - error "tomlish Found multiple value elements in KEY, expected exactly one. '$keyval_element'" - } - - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - #simple (non-container, no-substitution) datatype - set result [list type $type value $value] - } - STRING - STRINGPART { - set result [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL - LITERALPART { - #REVIEW - set result [list type $type value $value] - } - TABLE { - #invalid? - error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" - } - ITABLE { - #This one should not be returned as a type value structure! - # - set result [::tomlish::to_dict [list $found_sub]] - } - ARRAY { - #we need to recurse to get the corresponding dict for the contained item(s) - #pass in the whole $found_sub - not just the $value! - set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - } - MULTISTRING - MULTILITERAL { - #review - mapping these to STRING might make some conversions harder? - #if we keep the MULTI - we know we have to look for newlines for example when converting to json - #without specific types we'd have to check every STRING - and lose info about how best to map chars within it - set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - } - default { - error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" - } - } - return $result - } - - proc _get_dottedkey_info {dottedkeyrecord} { - set key_hierarchy [list] - set key_hierarchy_raw [list] - if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { - error "tomlish _get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" - } - set compoundkeylist [lindex $dottedkeyrecord 1] - set expect_sep 0 - foreach part $compoundkeylist { - set parttag [lindex $part 0] - if {$parttag eq "WS"} { - continue - } - if {$expect_sep} { - if {$parttag ne "DOTSEP"} { - error "DOTTEDKEY missing dot separator between parts. 
'$dottedkeyrecord'" - } - set expect_sep 0 - } else { - set val [lindex $part 1] - switch -exact -- $parttag { - KEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw $val - } - QKEY { - lappend key_hierarchy [::tomlish::utils::unescape_string $val] - lappend key_hierarchy_raw \"$val\" - } - SQKEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw "'$val'" - } - default { - error "DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$item'" - } - } - set expect_sep 1 - } - } - return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] - } - - - - #to_dict is a *basic* programmatic datastructure for accessing the data. - # produce a dictionary of keys and values from a tomlish tagged list. - # to_dict is primarily for reading toml data. - #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, - # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. - # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. - #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. - # - #Note, currently we get a plain sub dictionary when an inline table is a direct value for a key, but an ITABLE when it's in an ARRAY - REVIEW - #Namespacing? - #ie note the difference: - #[Data] - #temp = { cpu = 79.5, case = 72.0} - # versus - #[Data] - #temps = [{cpu = 79.5, case = 72.0}] - proc to_dict {tomlish} { - - #keep track of which tablenames have already been directly defined, - # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' - #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. - #we don't error out just because a previous tablename segment has already appeared. - ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] - } - - log::info ">>> processing '$tomlish'<<<" - set items $tomlish - - foreach lst $items { - if {[lindex $lst 0] ni $::tomlish::tags} { - error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" - } - } - - if {[lindex $tomlish 0] eq "TOMLISH"} { - #ignore TOMLISH tag at beginning - set items [lrange $tomlish 1 end] - } - - set datastructure [dict create] - foreach item $items { - set tag [lindex $item 0] - #puts "...> item:'$item' tag:'$tag'" - switch -exact -- $tag { - KEY - QKEY - SQKEY { - log::debug "--> processing $tag: $item" - set key [lindex $item 1] - if {$tag eq "QKEY"} { - set key [::tomlish::utils::unescape_string $key] - } - #!todo - normalize key. (may be quoted/doublequoted) - - if {[dict exists $datastructure $key]} { - error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." 
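A hedged illustration of the value structure handled here: for a simple keyval element, _get_keyval_value should return a two-entry type/value dict, which to_dict then stores under the key. The same shape is reachable via the public path (keyval_element layout taken from the decoder's own examples):

#element layout as produced by the decoder, e.g. for 'port = 8080'
set keyval_element {KEY port = {WS { }} {INT 8080}}
puts [::tomlish::_get_keyval_value $keyval_element]   ;#expected: type INT value 8080

#same shape end to end (illustrative, output is approximate):
set d [tomlish::to_dict [tomlish::from_toml "port = 8080\n"]]
puts [dict get $d port]                               ;#expected: type INT value 8080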
- } - - #lassign [_get_keyval_value $item] type val - set keyval_dict [_get_keyval_value $item] - dict set datastructure $key $keyval_dict - } - DOTTEDKEY { - log::debug "--> processing $tag: $item" - set dkey_info [_get_dottedkey_info $item] - set dotted_key_hierarchy [dict get $dkey_info keys] - set dotted_key_hierarchy_raw [dict get $dkey_info keys_raw] - - #a.b.c = 1 - #table_key_hierarchy -> a b - #leafkey -> c - if {[llength $dotted_key_hierarchy] == 0} { - #empty?? probably invalid. review - #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively - error "DOTTED key has no parts - invalid? '$item'" - } elseif {[llength $dotted_key_hierarchy] == 1} { - #dottedkey is only a key - no table component - set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] - } else { - set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set table_hierarchy_raw [lrange $dotted_key_hierarchy_raw 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] - } - - #ensure empty tables are still represented in the datastructure - set pathkeys [list] - foreach k $table_hierarchy { - lappend pathkeys $k - if {![dict exists $datastructure {*}$pathkeys]} { - dict set datastructure {*}$pathkeys [list] - } else { - tomlish::log::notice "to_dict datastructure at key path $pathkeys already had data: [dict get $datastructure {*}$pathkeys]" - } - } - - set keyval_dict [_get_keyval_value $item] - dict set datastructure {*}$pathkeys $leafkey $keyval_dict - - #JMN test 2025 - - } - TABLE { - set tablename [lindex $item 1] - set tablename [::tomlish::utils::tablename_trim $tablename] - - if {$tablename in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - - log::debug "--> processing $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. - - set table_key_hierarchy [list] - set table_key_hierarchy_raw [list] - - foreach rawseg $name_segments { - - set seg [::tomlish::utils::normalize_key $rawseg] ;#strips one level of enclosing quotes, and substitutes only toml-specified escapes - set c1 [tcl::string::index $rawseg 0] - set c2 [tcl::string::index $rawseg end] - if {($c1 eq "'") && ($c2 eq "'")} { - #single quoted segment. No escapes are processed within it. - set seg [tcl::string::range $rawseg 1 end-1] - } elseif {($c1 eq "\"") && ($c2 eq "\"")} { - #double quoted segment. Apply escapes. - set seg [::tomlish::utils::unescape_string [tcl::string::range $rawseg 1 end-1]] - } else { - set seg $rawseg - } - - #no need to check for empty segments here - we've already called tablename_split which would have raised an error for empty segments. - #if {$rawseg eq ""} { - # error "Table name '[lindex $item 1]' is not valid. 
All segments (parts between dots) must be non-empty" - #} - lappend table_key_hierarchy $seg - lappend table_key_hierarchy_raw $rawseg - - if {[dict exists $datastructure {*}$table_key_hierarchy]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should bfail on encoungerint table.x.y because only table and table.x are effectively tables - ## - we should also fail if - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #TODO! fix - this code is wrong - - set testkey [join $table_key_hierarchy_raw .] - - set testkey_length [llength $table_key_hierarchy_raw] - set found_testkey 0 - if {$testkey in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen $tablenames_seen { - set seen_segments [::tomlish::utils::tablename_split $seen] - #these are raw unnormalized tablename segments. Need to normalize the double-quoted ones, - # and strip the quotes from both single-quoted and double-quoted entries. - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - - #VVV the test below is wrong VVV! - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set seen_match [join [lrange $seen_segments 0 [expr {$testkey_length -1}]] .] - puts stderr "testkey:'$testkey' vs seen_match:'$seen_match'" - if {$testkey eq $seen_match} { - set found_testkey 1 - } - } - } - - if {$found_testkey == 0} { - #the raw table_key_hierarchy is better to display in the error message, although it's not the actual dict keyset - set msg "key [join $table_key_hierarchy_raw .] already exists in datastructure, but wasn't defined by a supertable." - append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } - error $msg - } - } - - } - - #ensure empty tables are still represented in the datastructure - set table_keys [list] - foreach k $table_key_hierarchy { - lappend table_keys $k - if {![dict exists $datastructure {*}$table_keys]} { - dict set datastructure {*}$table_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $table_keys already had data: [dict get $datastructure {*}$table_keys]" - } - } - - #We must do this after the key-collision test above! 
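A hedged example of the rule the tablenames_seen tracking enforces (per the comments above): extending an existing table with [a.b] is fine, but directly re-declaring [a] should be rejected by to_dict:

#ok: [a] then [a.b]
tomlish::to_dict [tomlish::from_toml "\[a\]\nx = 1\n\[a.b\]\ny = 2\n"]

#invalid: [a] declared twice - expect an error mentioning the duplicate table
if {[catch {tomlish::to_dict [tomlish::from_toml "\[a\]\nx = 1\n\[a\]\ny = 2\n"]} errmsg]} {
    puts "rejected as expected: $errmsg"
}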
- lappend tablenames_seen $tablename - - - log::debug ">>>>>>>>>>>>>>>>>>>>table_key_hierarchy : $table_key_hierarchy" - log::debug ">>>>>>>>>>>>>>>>>>>>table_key_hierarchy_raw: $table_key_hierarchy_raw" - - #now add the contained elements - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "--> $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [_get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' y} - #e.g2 keys {x.y y} keys_raw {{"x.y"} y} - set dotted_key_hierarchy [dict get $dkey_info keys] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - set leaf_key [lindex $dotted_key_hierarchy end] - set dotted_key_hierarchy_raw [dict get $dkey_info keys_raw] - set dkeys_raw [lrange $dotted_key_hierarchy_raw 0 end-1] - set leaf_key_raw [lindex $dotted_key_hierarchy_raw end] - - #ensure empty keys are still represented in the datastructure - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout ">>> $keyval_dict" - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - #JMN 2025 - #tomlish::utils::normalize_key ?? - lappend tablenames_seen [join [list {*}$table_key_hierarchy_raw {*}$dkeys_raw] .] ;#???? - #if the keyval_dict is not a simple type x value y - then it's an inline table ? - #if so - we should add the path to the leaf_key as a seen table too - as it's not allowed to have more entries added. - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [join [list {*}$table_key_hierarchy_raw {*}$dkeys_raw $leaf_key_raw] .] - } - - } - KEY - QKEY - SQKEY { - #obsolete ? - set keyval_key [lindex $element 1] - if {$type eq "QKEY"} { - set keyval_key [::tomlish::utils::unescape_string $keyval_key] - } - if {[dict exists $datastructure {*}$dotted_key_hierarchy $keyval_key]} { - error "Duplicate key '$dotted_key_hierarchy $key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - dict set datastructure {*}$dotted_key_hierarchy $keyval_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,QKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - #now make sure we add an empty value if there were no contained elements! - #!todo. - } - ITABLE { - #SEP??? 
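A hedged sketch of the inline-table case noted earlier: when an inline table is the direct value of a key, to_dict currently returns a plain nested dictionary rather than a type/value wrapper:

set d [tomlish::to_dict [tomlish::from_toml "temp = { cpu = 79.5, case = 72.0 }\n"]]
#expected shape (roughly): temp {cpu {type FLOAT value 79.5} case {type FLOAT value 72.0}}
puts [dict get $d temp cpu]   ;#-> type FLOAT value 79.5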
- set datastructure [list] - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [_get_dottedkey_info $element] - set dotted_key_hierarchy [dict get $dkey_info keys] - set dotted_key_hierarchy_raw [dict get $dkey_info keys_raw] - set leaf_key [lindex $dotted_key_hierarchy end] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - - #ensure empty keys are still represented in the datastructure - set table_keys [list] ;#We don't know the context - next level up will have to check for key collisions? - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,QKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - } - ARRAY { - #arrays in toml are allowed to contain mixtures of types - set datastructure [list] - log::debug "--> processing array: $item" - - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - STRING { - set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - ITABLE - TABLE - ARRAY - MULTISTRING - MULTILITERAL { - set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - } - WS - SEP - NEWLINE - COMMENT { - #ignore whitespace, commas, newlines and comments - } - default { - error "Unexpected value type '$type' found in array" - } - } - } - } - MULTILITERAL { - #triple squoted string - #first newline stripped only if it is the very first element - #(ie *immediately* following the opening delims) - #All whitespace other than newlines is within LITERALPARTS - # ------------------------------------------------------------------------- - #todo - consider extension to toml to allow indent-aware multiline literals - # how - propose as issue in toml github? Use different delim? e.g ^^^ ? 
- #e.g - # xxx=?'''abc - # def - # etc - # ''' - # - we would like to trimleft each line to the column following the opening delim - # ------------------------------------------------------------------------- - - log::debug "--> processing multiliteral: $item" - set parts [lrange $item 1 end] - if {[lindex $parts 0 0] eq "NEWLINE"} { - set parts [lrange $parts 1 end] ;#skip it - } - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - switch -exact -- $type { - LITERALPART { - append stringvalue [lindex $element 1] - } - NEWLINE { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - MULTISTRING { - #triple dquoted string - log::debug "--> processing multistring: $item" - set stringvalue "" - set idx 0 - set parts [lrange $item 1 end] - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted - switch -exact -- $type { - STRING { - #todo - do away with STRING ? - #we don't build MULTISTRINGS containing STRING - but should we accept it? - tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" - append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" - } - STRINGPART { - append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] - } - CONT { - #When the last non-whitespace character on a line is an unescaped backslash, - #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter - # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? 
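As a hedged illustration of the continuation trimming being handled here: a line-ending backslash in a basic multiline string should swallow the newline and any following whitespace, so the flattened value holds no break:

#toml source:
#  s = """one \
#       two"""
set toml "s = \"\"\"one \\\n     two\"\"\"\n"
set d [tomlish::to_dict [tomlish::from_toml $toml]]
puts [dict get $d s]   ;#expected (roughly): type MULTISTRING value {one two}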
- set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - append stringvalue "\\" ;#add the sep - } else { - #skip over ws without emitting - set idx [llength $parts] - } - } else { - set parts_til_nl [lrange $parts 0 $next_nl-1] - set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] - if {$non_ws >= 0} { - append stringvalue "\\" - } else { - #skip over ws on this line - set idx $next_nl - #then have to check each subsequent line until we get to first non-whitespace - set trimming 1 - while {$trimming && $idx < [llength $parts]} { - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - } else { - set idx [llength $parts] - } - set trimming 0 - } else { - set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - set trimming 0 - } else { - set idx $next_nl - #keep trimming - } - } - } - } - } - } - NEWLINE { - #if newline is first element - it is not part of the data of a multistring - if {$idx > 0} { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - } - WS { - append stringvalue [lindex $element 1] - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - WS - COMMENT - NEWLINE { - #ignore - } - default { - error "Unexpected tag '$tag' in Tomlish list '$tomlish'" - } - } - } - return $datastructure - } - - - proc _from_dictval_tomltype {parents tablestack keys typeval} { - set type [dict get $typeval type] - set val [dict get $typeval value] - switch -- $type { - ARRAY { - set subitems [list] - foreach item $val { - lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP - } - if {[lindex $subitems end] eq "SEP"} { - set subitems [lrange $subitems 0 end-1] - } - return [list ARRAY {*}$subitems] - } - ITABLE { - if {$val eq ""} { - return ITABLE - } else { - return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] - } - } - MULTISTRING { - #value is a raw string that isn't encoded as tomlish - #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format - set tomlpart "x=\"\"\"\\\n" - append tomlpart $val "\"\"\"" - set tomlish [tomlish::decode::toml $tomlpart] - #e.g if val = " etc\nblah" - #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } - #lindex 1 3 is the MULTISTRING tomlish list - return [lindex $tomlish 1 3] - } - MULTILITERAL { - set tomlpart "x='''\n" - append tomlpart $val ''' - set tomlish [tomlish::decode::toml $tomlpart] - return [lindex $tomlish 1 3] - } - default { - return [list $type $val] - } - } - } - - proc _from_dictval {parents tablestack keys vinfo} { - set k [lindex $keys end] - if {[regexp {\s} $k] || [string first . $k] >= 0} { - #Any dot in the key would have been split by to_dict - so if it's present here it's part of this key - not a level separator! - #todo - QKEY? 
- set K_PART [list SQKEY $k] - } else { - set K_PART [list KEY $k] - } - puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" - puts stderr "---tablestack: $tablestack---" - set result [list] - set lastparent [lindex $parents end] - if {$lastparent in [list "" do_inline]} { - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - set type [dict get $vinfo type] - #treat ITABLE differently? - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} - } else { - #set result [list TABLE $k {NEWLINE lf}] - if {$vinfo ne ""} { - - #set result [list DOTTEDKEY [list [list KEY $k]] = ] - #set records [list ITABLE] - - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - - if {$lastparent eq "do_inline"} { - set result [list DOTTEDKEY [list $K_PART] =] - set records [list ITABLE] - } else { - #review - quoted k ?? - set result [list TABLE $k {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $k]] - set records [list] - } - - - - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - dict for {vk vv} $vinfo { - if {[regexp {\s} $vk] || [string first . $vk] >= 0} { - set VK_PART [list SQKEY $vk] - } else { - set VK_PART [list KEY $vk] - } - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] - } else { - if {$vv eq ""} { - #experimental - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" - #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] - set tname [join [list {*}$keys $vk] .] - set record [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - } else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - set tablestack [list {*}$tablestack [list I $vk]] - } - } else { - if { 0 } { - #experiment.. sort of getting there. - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" - set tname [join [list {*}$keys $vk] .] - set record [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - - #review - todo? 
- set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] - lappend record {*}$dottedkey_value - - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - } - if {$dictidx != $lastidx} { - #lappend record SEP - if {$lastparent eq "do_inline"} { - lappend record SEP - } else { - lappend record {NEWLINE lf} - } - } - lappend records $record - incr dictidx - } - if {$lastparent eq "do_inline"} { - lappend result $records {NEWLINE lf} - } else { - lappend result {*}$records {NEWLINE lf} - } - } else { - if {$lastparent eq "do_inline"} { - lappend result DOTTEDKEY [list [list KEY $k]] = ITABLE {NEWLINE lf} - } else { - lappend result TABLE $k {NEWLINE lf} - } - } - } - } else { - #lastparent is not toplevel "" or "do_inline" - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result {*}$sublist - } else { - if {$lastparent eq "TABLE"} { - #review - dict for {vk vv} $vinfo { - set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] - lappend result [list DOTTEDKEY [list [list KEY $vk]] = $dottedkey_value {NEWLINE lf}] - } - } else { - if {$vinfo ne ""} { - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - set sub [list] - set result $lastparent ;#e.g sets ITABLE - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - dict for {vk vv} $vinfo { - if {[regexp {\s} $vk] || [string first . $vk] >=0} { - set VK_PART [list SQKEY $vk] - } else { - set VK_PART [list KEY $vk] - } - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART] = $sublist] - } else { - if {$vv eq ""} { - #can't just uninline at this level - #we need a better method to query main dict for uninlinability at each level - # (including what's been inlined already) - #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - # puts stderr "_from_dictval uninline2 KEY $keys" - # set tname [join [list {*}$keys $vk] .] - # set record [list TABLE $tname {NEWLINE lf}] - # set tablestack [list {*}$tablestack [list T $vk]] - #} else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - #} - } else { - #set sub [_from_dictval ITABLE $vk $vv] - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - if {$dictidx != $lastidx} { - lappend record SEP - } - lappend result $record - incr dictidx - } - } else { - puts stderr "table x-1" - lappend result DOTTEDKEY [list $K_PART] = ITABLE - } - } - } - } - return $result - } - - - proc from_dict {d} { - #consider: - # t1={a=1,b=2} - # x = 1 - #If we represent t1 as an expanded table we get - # [t1] - # a=1 - # b=2 - # x=1 - # --- which is incorrect - as x was a toplevel key like t1! 
- #This issue doesn't occur if x is itself an inline table - # t1={a=1,b=2} - # x= {no="problem"} - # - # (or if we were to reorder x to come before t1) - - #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} - #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, - #which is unpreferred here. - - #A possible solution: - #scan the top level to see if all (trailing) elements are themselves dicts - # (ie not of form {type XXX value yyy}) - # - # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements - #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys - - #set root_has_values 0 - #approach 1) - the naive approach - forces inline when not always necessary - #dict for {k v} $d { - # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { - # set root_has_values 1 - # break - # } - #} - - - #approach 2) - track the position of last {type x value y} in the dictionary built by to_dict - # - still not perfect. Inlines dotted tables unnecessarily - #This means from_dict doesn't produce output optimal for human editing. - set last_simple [tomlish::dict::last_tomltype_posn $d] - - - ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values - #Any keys that are themselves tables - will need to be represented inline - #to avoid reordering, or incorrect assignment of plain values to the wrong table. - - ## set parent "" - #all toplevel keys in the dict structure can represent subtables. - #we are free to use {[tablename]\n} syntax for toplevel elements. - - - set tomlish [list TOMLISH] - set dictposn 0 - set tablestack [list [list T root]] ;#todo - dict for {t tinfo} $d { - if {$last_simple > $dictposn} { - set parents [list do_inline] - } else { - set parents [list ""] - } - set keys [list $t] - set trecord [_from_dictval $parents $tablestack $keys $tinfo] - lappend tomlish $trecord - incr dictposn - } - return $tomlish - } - - proc json_to_toml {json} { - #*** !doctools - #[call [fun json_to_toml] [arg json]] - #[para] - - set tomlish [::tomlish::from_json $json] - set toml [::tomlish::to_toml $tomlish] - } - - #TODO use huddle? 
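Going the other way, a hedged sketch of from_dict: the input must already use the type/value leaf form that to_dict produces, and the resulting tomlish list can be rendered with to_toml (exact layout of the generated toml depends on the inlining heuristics discussed above):

set d [dict create \
    title  {type STRING value hello} \
    server [dict create port {type INT value 8080}] \
]
set toml [tomlish::to_toml [tomlish::from_dict $d]]
puts $toml
#expected output, roughly:
#  title = "hello"
#  [server]
#  port = 8080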
- proc from_json {json} { - set jstruct [::tomlish::json_struct $json] - return [::tomlish::from_json_struct $jstruct] - } - - proc from_json_struct {jstruct} { - package require fish::json_toml - return [fish::json_toml::jsonstruct2tomlish $jstruct] - } - - proc toml_to_json {toml} { - set tomlish [::tomlish::from_toml $toml] - return [::tomlish::get_json $tomlish] - } - - proc get_json {tomlish} { - package require fish::json - set d [::tomlish::to_dict $tomlish] - #return [::tomlish::dict_to_json $d] - return [fish::json::from "struct" $d] - } - - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - - -namespace eval tomlish::encode { - #*** !doctools - #[subsection {Namespace tomlish::encode}] - #[para] - #[list_begin definitions] - - #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness - # take a value of the appropriate type and wrap as a tomlish tagged item - proc string {s} { - return [list STRING $s] - } - - proc int {i} { - #whole numbers, may be prefixed with a + or - - #Leading zeros are not allowed - #Hex,octal binary forms are allowed (toml 1.0) - #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) - #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. - # - We should probably raise an error for number larger than this and suggest the user supply it as a string? - if {[tcl::string::last , $i] > -1} { - error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" - } - if {![::tomlish::utils::int_validchars $i]} { - error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" - } - - if {[::tomlish::utils::is_int $i]} { - return [list INT $i] - } else { - error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" - } - - } - - proc float {f} { - #convert any non-lower case variants of special values to lowercase for Toml - if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { - return [list FLOAT [tcl::string::tolower $f]] - } - if {[::tomlish::utils::is_float $f]} { - return [list FLOAT $f] - } else { - error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" - } - } - - proc datetime {str} { - if {[::tomlish::utils::is_datetime $str]} { - return [list DATETIME $str] - } else { - error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" - } - } - - proc boolean {b} { - #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false - if {![tcl::string::is boolean -strict $b]} { - error "Unable to convert '$b' to Toml boolean true|false. 
[::tomlish::parse::report_line]" - } else { - if {[expr {$b && 1}]} { - return [::list BOOL true] - } else { - return [::list BOOL false] - } - } - } - - - #TODO - #Take tablename followed by - # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} - # (accept also key value {STRING }) - # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types - proc table {name args} { - set pairs [list] - foreach t $args { - if {[llength $t] == 4} { - if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { - error "Only items tagged as KEY = currently accepted as name-value pairs for table command" - } - lassign $t _k keystr _eq valuepart - if {[llength $valuepart] != 2} { - error "supplied value must be typed. e.g {INT 1} or {STRING test}" - } - lappend pairs [list KEY $keystr = $valuepart] - } elseif {[llength $t] == 2} { - #!todo - type heuristics - lassign $t n v - lappend pairs [list KEY $n = [list STRING $v]] - } else { - error "'KEY = { toml but - # the first newline is not part of the data. - # we elect instead to maintain a basic LITERALPART that must not contain newlines.. - # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, - #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. - set literal "" - foreach part [lrange $item 1 end] { - append literal [::tomlish::encode::tomlish [list $part] $nextcontext] - } - append toml '''$literal''' - } - INT - - BOOL - - FLOAT - - DATETIME { - append toml [lindex $item 1] - } - INCOMPLETE { - error "cannot process tomlish term tagged as INCOMPLETE" - } - COMMENT { - append toml "#[lindex $item 1]" - } - BOM { - #Byte Order Mark may appear at beginning of a file. Needs to be preserved. - append toml "\uFEFF" - } - default { - error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." - } - } - - } - return $toml - } - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] -} -#fish toml from tomlish - -#(encode tomlish as toml) -interp alias {} tomlish::to_toml {} tomlish::encode::tomlish - -# - - -namespace eval tomlish::decode { - #*** !doctools - #[subsection {Namespace tomlish::decode}] - #[para] - #[list_begin definitions] - - #return a Tcl list of tomlish tokens - #i.e get a standard list of all the toml terms in string $s - #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. - #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) - # ---------------------------------------------------------------------------------------------- - # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! - # e.g we deliberately don't check certain things such as duplicate table declarations here. - # ---------------------------------------------------------------------------------------------- - #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. - # (e.g perhaps a toml editor to highlight violations for fixing) - # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. 
- # e.g dicts or an object oriented structure - #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage - #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc - #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. - # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) - #If we were to unescape a tab character for example - # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. - # For this reason, we also do absolutely no line-ending transformations based on platform. - # All line-endings are maintained as is, and even a file with mixed cr crlf line-endings will be correctly interpreted and can be 'roundtripped' - - proc toml {args} { - #*** !doctools - #[call [fun toml] [arg arg...]] - #[para] return a Tcl list of tomlish tokens - - set s [join $args \n] - - namespace upvar ::tomlish::parse is_parsing is_parsing - set is_parsing 1 - - - if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { - tomlish::parse::spacestack destroy - } - struct::stack ::tomlish::parse::spacestack - - namespace upvar ::tomlish::parse last_space_action last_space_action - namespace upvar ::tomlish::parse last_space_type last_space_type - - - namespace upvar ::tomlish::parse tok tok - set tok "" - - namespace upvar ::tomlish::parse type type - namespace upvar ::tomlish::parse tokenType tokenType - ::tomlish::parse::set_tokenType "" - namespace upvar ::tomlish::parse tokenType_list tokenType_list - set tokenType [list] ;#Flat (un-nested) list of tokentypes found - - namespace upvar ::tomlish::parse lastChar lastChar - set lastChar "" - - - set result "" - namespace upvar ::tomlish::parse nest nest - set nest 0 - - namespace upvar ::tomlish::parse v v ;#array keyed on nest level - - - set v(0) {TOMLISH} - array set s0 [list] ;#whitespace data to go in {SPACE {}} element. - set parentlevel 0 - - namespace upvar ::tomlish::parse i i - set i 0 - - namespace upvar ::tomlish::parse state state - - namespace upvar ::tomlish::parse braceCount braceCount - set barceCount 0 - namespace upvar ::tomlish::parse bracketCount bracketCount - set bracketCount 0 - - set sep 0 - set r 1 - namespace upvar ::tomlish::parse token_waiting token_waiting - set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. - - - set state "table-space" - ::tomlish::parse::spacestack push {type space state table-space} - namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) - set linenum 1 - - set ::tomlish::parse::state_list [list] - try { - while {$r} { - set r [::tomlish::parse::tok $s] - #puts stdout "got tok: '$tok' while parsing string '$s' " - set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' - - - - #puts "got token: '$tok' tokenType='$tokenType'. 
while v($nest) = [set v($nest)]" - #puts "-->tok: $tok tokenType='$tokenType'" - set prevstate $state - set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] - #review goNextState could perform more than one space_action - set space_action [dict get $transition_info space_action] - set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below - - if {[tcl::string::match "err-*" $state]} { - ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" - lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] - return $v(0) - } - # --------------------------------------------------------- - #NOTE there may already be a token_waiting at this point - #set_token_waiting can raise an error here, - # in which case the space_action branch needs to be rewritten to handle the existing token_waiting - # --------------------------------------------------------- - - if {$space_action eq "pop"} { - #pop_trigger_tokens: newline tablename endarray endinlinetable - #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. - set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append - switch -exact -- $tokenType { - squote_seq { - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the xxx-squote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - switch -- $tok { - ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] - } - '' { - #review - we should perhaps return double_squote instead? 
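For orientation while reading the single-quote handling: literal (single-quoted) strings receive no escape processing, in contrast to basic strings, which becomes visible after conversion with to_dict. A hedged example:

set d [tomlish::to_dict [tomlish::from_toml "path = 'C:\\new\\temp'\nname = \"a\\tb\"\n"]]
puts [dict get $d path]   ;#expected: type LITERAL value C:\new\temp   (backslashes kept as-is)
puts [dict get $d name]   ;#expected: type STRING value with a real tab between a and b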
- #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] - } - ''' { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] - } - '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop - } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } - } - default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" - } - } - } - ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop - } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } - } - default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" - } - } - } - } - puts "---- HERE squote_seq pop <$tok>" - } - triple_squote { - #presumably popping multiliteral-space - ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTILITERAL { - lappend merged $part - } - LITERALPART { - if {$lasttype eq "LITERALPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. 
- lappend merged $part - } - default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - equal { - if {$prevstate eq "dottedkey-space"} { - tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - tablename { - #note: a tablename only 'pops' if we are greater than zero - error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" - } - tablearrayname { - #!review - tablearrayname different to tablename regarding push/pop? - #note: a tablename only 'pops' if we are greater than zero - error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" - } - endarray { - #nothing to do here. - } - comma { - #comma for inline table will pop the keyvalue space - lappend v($nest) "SEP" - } - endinlinetable { - ::tomlish::log::debug "---- endinlinetable for last_space_action pop" - } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } - default { - error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" - } - } - if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here - lappend v($parentlevel) [set v($nest)] - } - - incr nest -1 - - } elseif {$last_space_action eq "push"} { - set prevnest $nest - incr nest 1 - set v($nest) [list] - # push_trigger_tokens: barekey quotedkey startinlinetable startarray tablename tablearrayname - - - switch -exact -- $tokenType { - squote_seq_begin { - #### - if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { - lassign [dict get $transition_info starttok] starttok_type starttok_val - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType $starttok_type - set tok $starttok_val - } - } - squotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - barekey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - set waiting [tomlish::parse::get_token_waiting] - if {[llength $waiting]} { - set i [dict get $waiting startindex] - tomlish::parse::clear_token_waiting - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } else { - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - } - startsquote { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - quotedkey - itablequotedkey { - set v($nest) [list QKEY $tok] ;#$tok is the keyname - } - itablesquotedkey { - set v($nest) [list SQKEY $tok] ;#$tok is the keyname - } - tablename { - #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! 
- #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish - # back to toml file will be identical. - #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. - # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. - - #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, - # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the - # tomlish list? - - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" - set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name - #note also that equivalent tablenames may have different toml representations even after being trimmed! - #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) - #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. - } - tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" - set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name - } - startarray { - set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. - } - startinlinetable { - set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. - } - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } - default { - error "---- push trigger tokenType '$tokenType' not yet implemented" - } - } - - } else { - #no space level change - switch -exact -- $tokenType { - squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" - lappend v($nest) [list SQKEY $tok] - } - barekey { - lappend v($nest) [list KEY $tok] - } - dotsep { - lappend v($nest) [list DOTSEP] - } - starttablename { - #$tok is triggered by the opening bracket and sends nothing to output - } - starttablearrayname { - #$tok is triggered by the double opening brackets and sends nothing to output - } - tablename - tablenamearray { - error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" - #set v($nest) [list TABLE $tok] - } - endtablename - endtablearrayname { - #no output into the tomlish list for this token - } - startinlinetable { - puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" - } - startquote { - switch -exact -- $newstate { - string-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "string" - set tok "" - } - quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "quotedkey" - set tok "" - } - itable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" - } - default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - startsquote { - switch -exact -- $newstate { - literal-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "literal" - set tok "" - } - squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - itable-squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablesquotedkey" - set tok "" - } - multiliteral-space { - #false alarm squote returned from squote_seq pop - ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" - #(single squote - not terminating space) - lappend v($nest) [list LITERALPART '] - #may need to be joined on pop if there are neighbouring LITERALPARTs - } - default { - error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } - double_squote { - switch -exact -- $prevstate { - keyval-value-expected { - lappend v($nest) [list LITERAL ""] - } - multiliteral-space { - #multiliteral-space to multiliteral-space - lappend v($nest) [list LITERALPART ''] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - multistring { - #review - lappend v($nest) [list MULTISTRING $tok] - } - stringpart { - lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly - } - multiliteral { - lappend v($nest) [LIST MULTILITERAL $tok] - } - literalpart { - lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - quotedkey { - #lappend v($nest) [list QKEY $tok] ;#TEST - } - itablequotedkey { - - } - untyped_value { - #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. - if {$tok in {true false}} { - set tag BOOL - } elseif {[::tomlish::utils::is_int $tok]} { - set tag INT - } elseif {[::tomlish::utils::is_float $tok]} { - set tag FLOAT - } elseif {[::tomlish::utils::is_datetime $tok]} { - set tag DATETIME - } else { - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. 
[tomlish::parse::report_line] (no space level change)" - } - lappend v($nest) [list $tag $tok] - - } - comment { - #puts stdout "----- comment token returned '$tok'------" - lappend v($nest) [list COMMENT "$tok"] - } - equal { - #we append '=' to the nest so that any surrounding whitespace is retained. - lappend v($nest) = - } - comma { - lappend v($nest) SEP - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - whitespace { - lappend v($nest) [list WS $tok] - } - continuation { - lappend v($nest) CONT - } - bom { - lappend v($nest) BOM - } - eof { - #ok - nothing more to add to the tomlish list. - #!todo - check previous tokens are complete/valid? - } - default { - error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - - if {!$next_tokenType_known} { - ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" - ::tomlish::parse::set_tokenType "" - set tok "" - } - - if {$state eq "end-state"} { - break - } - - - } - - #while {$nest > 0} { - # lappend v([expr {$nest -1}]) [set v($nest)] - # incr nest -1 - #} - while {[::tomlish::parse::spacestack size] > 1} { - ::tomlish::parse::spacestack pop - lappend v([expr {$nest -1}]) [set v($nest)] - incr nest -1 - - #set parent [spacestack peek] ;#the level being appended to - #lassign $parent type state - #if {$type eq "space"} { - # - #} elseif {$type eq "buffer"} { - # lappend v([expr {$nest -1}]) {*}[set v($nest)] - #} else { - # error "invalid spacestack item: $parent" - #} - } - - } finally { - set is_parsing 0 - } - return $v(0) - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] -} -#decode toml to tomlish -interp alias {} tomlish::from_toml {} tomlish::decode::toml - -namespace eval tomlish::utils { - #*** !doctools - #[subsection {Namespace tomlish::utils}] - #[para] - #[list_begin definitions] - - - #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace - # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] - #trimmed, the tablename becomes {a.b.c} - # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] - #ie whitespace is only irrelevant if it's outside a quoted segment - #trimmed, the tablename becomes {a.b."c etc "} - proc tablename_trim {tablename} { - set segments [tablename_split $tablename false] - set trimmed_segments [list] - foreach seg $segments { - lappend trimmed_segments [::string trim $seg " \t"] - } - return [join $trimmed_segments .] 
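        # A minimal usage sketch (illustrative only, assuming the tomlish package is loaded) of the
        # trimming rules described above - whitespace outside quoted segments is dropped while a
        # quoted segment keeps its internal spacing:
        #   % ::tomlish::utils::tablename_trim {a . b . "c etc " }
        #   a.b."c etc "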
- } - - #basic generic quote matching for single and double quotes - #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes - proc tok_in_quotedpart {tok} { - set sLen [tcl::string::length $tok] - set quote_type "" - set had_slash 0 - for {set i 0} {$i < $sLen} {incr i} { - set c [tcl::string::index $tok $i] - if {$quote_type eq ""} { - if {$had_slash} { - #don't enter quote mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - set quote_type dq - } - sq { - set quote_type sq - } - bsl { - set had_slash 1 - } - } - } - } else { - if {$had_slash} { - #don't leave quoted mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - if {$quote_type eq "dq"} { - set quote_type "" - } - } - sq { - if {$quote_type eq "sq"} { - set quote_type "" - } - } - bsl { - set had_slash 1 - } - } - } - } - } - return $quote_type ;#dq | sq - } - - #utils::tablename_split - proc tablename_split {tablename {normalize false}} { - #we can't just split on . because we have to handle quoted segments which may contain a dot. - #eg {dog."tater.man"} - set sLen [tcl::string::length $tablename] - set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax - #quoted is for double-quotes, litquoted is for single-quotes (string literal) - set seg "" - for {set i 0} {$i < $sLen} {incr i} { - - if {$i > 0} { - set lastChar [tcl::string::index $tablename [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $tablename $i] - - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" - } - } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - lappend segments $seg - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" - } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c - } - } else { - if {$mode eq "syntax"} { - error "tablename_split. 
Expected a dot separator. got '$c'. tablename: '$tablename'" - } - if {$mode eq "unknown"} { - set mode "unquoted" - } - append seg $c - } - if {$i == $sLen-1} { - #end of data - ::tomlish::log::debug "End of data: mode='$mode'" - switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments $seg - } else { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } - } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. tablename: '$tablename'" - } - lappend segments $seg - } - unquoted - unknown { - lappend segments $seg - } - syntax { - #ok - segment already lappended - } - default { - lappend segments $seg - } - } - } - } - foreach seg $segments { - set trimmed [tcl::string::trim $seg " \t"] - #note - we explicitly allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - #if {$trimmed in [list "''" "\"\""]} { - # puts stderr "tablename_split. warning - Empty quoted string as tablename segment" - #} - if {$trimmed eq "" } { - error "tablename_split. Empty segment found. tablename: '$tablename' segments [llength $segments] ($segments)" - } - } - return $segments - } - - proc unicode_escape_info {slashu} { - #!todo - # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and - # is a valid 'unicode scalar value' - # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive - #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} - if {[tcl::string::match {\\u*} $slashu]} { - set exp {^\\u([0-9a-fA-F]{4}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %4x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } - } else { - return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] - } - } elseif {[tcl::string::match {\\U*} $slashu]} { - set exp {^\\U([0-9a-fA-F]{8}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %8x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } else { - return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] - } - } - } else { - return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] - } - } else { - return [list err [list reason "Supplied string did not start with \\u or \\U" ]] - } - - } - - proc unescape_string {str} { - #note we can't just use Tcl subst because: - # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. 
- # it would strip out backslashes inappropriately: e.g "\j" becomes just j - # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn - # it replaces\ with a single whitespace - #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh - - set buffer "" - set buffer4 "" ;#buffer for 4 hex characters following a \u - set buffer8 "" ;#buffer for 8 hex characters following a \u - - set sLen [tcl::string::length $str] - - #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc - set slash_active 0 - set unicode4_active 0 - set unicode8_active 0 - - - #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? - set i 0 - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $str [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $str $i] - ::tomlish::log::debug "unescape_string. got char $c" - scan $c %c n - if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { - #we don't expect unescaped unicode characters from 0000 to 001F - - #*except* for raw tab (which is whitespace) and newlines - error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" - } - incr i ;#must incr here because we do'returns'inside the loop - if {$c eq "\\"} { - if {$slash_active} { - append buffer "\\" - set slash_active 0 - } elseif {$unicode4_active} { - error "unescape_string. unexpected case slash during unicode4 not yet handled" - } elseif {$unicode8_active} { - error "unescape_string. unexpected case slash during unicode8 not yet handled" - } else { - # don't output anything (yet) - set slash_active 1 - } - } else { - if {$unicode4_active} { - if {[tcl::string::length $buffer4] < 4} { - append buffer4 $c - } - if {[tcl::string::length $buffer4] == 4} { - #we have a \uHHHH to test - set unicode4_active 0 - set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$unicode8_active} { - if {[tcl::string::length $buffer8] < 8} { - append buffer8 $c - } - if {[tcl::string::length $buffer8] == 8} { - #we have a \UHHHHHHHH to test - set unicode8_active 0 - set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$slash_active} { - set slash_active 0 - set ctest [tcl::string::map {{"} dq} $c] - switch -exact -- $ctest { - dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] - } - b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] - } - u { - set unicode4_active 1 - set buffer4 "" - } - U { - set unicode8_active 1 - set buffer8 "" - } - default { - set slash_active 0 - - append buffer "\\" - append buffer $c - } - } - } else { - append buffer $c - } - } - } - #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" - if {$unicode4_active} { - error "End of string reached before complete unicode escape sequence \uHHHH" - } - if {$unicode8_active} { - error "End of string reached before complete unicode escape sequence \UHHHHHHHH" - } - if {$slash_active} { - append buffer "\\" - } - return $buffer - } - - proc 
normalize_key {rawkey} { - set c1 [tcl::string::index $rawkey 0] - set c2 [tcl::string::index $rawkey end] - if {($c1 eq "'") && ($c2 eq "'")} { - #single quoted segment. No escapes allowed within it. - set key [tcl::string::range $rawkey 1 end-1] - } elseif {($c1 eq "\"") && ($c2 eq "\"")} { - #double quoted segment. Apply escapes. - # - set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only - set key [::tomlish::utils::unescape_string $keydata] - #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. - } else { - set key $rawkey - } - return $key - } - - proc string_to_slashu {string} { - set rv {} - foreach c [split $string {}] { - scan $c %c c - append rv {\u} - append rv [format %.4X $c] - } - return $rv - } - - #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. - proc nonprintable_to_slashu {s} { - set res "" - foreach i [split $s ""] { - scan $i %c c - - set printable 0 - if {($c>31) && ($c<127)} { - set printable 1 - } - if {$printable} {append res $i} else {append res \\u[format %.4X $c]} - } - set res - } ;#RS - - #check if str is valid for use as a toml bare key - #Early toml versions? only allowed letters + underscore + dash - proc is_barekey1 {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } else { - set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters match the regexp - return 1 - } else { - return 0 - } - } - } - - #from toml.abnf in github.com/toml-lang/toml - #unquoted-key = 1*unquoted-key-char - #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ - #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions - #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block - #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon - #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics - #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators - #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols - #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation - #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank - #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space - #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - variable re_barekey - set ranges [list] - lappend ranges {a-zA-Z0-9\_\-} - lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions - lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block - lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon - lappend ranges {\u200C-\u200D} 
{\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics - lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators - lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols - lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation - lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank - lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space - lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - set re_barekey {^[} - foreach r $ranges { - append re_barekey $r - } - append re_barekey {]+$} - - proc is_barekey {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } - variable re_barekey - return [regexp $re_barekey $str] - } - - #test only that the characters in str are valid for the toml specified type 'integer'. - proc int_validchars1 {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - #add support for hex,octal,binary 0x.. 0o.. 0b... - proc int_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_int {str} { - set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - - # --------------------------------------- - #check for leading zeroes in non 0x 0b 0o - #first strip any +, - or _ (just for this test) - set check [tcl::string::map {+ "" - "" _ ""} $str] - if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { - return 0 - } - # --------------------------------------- - - #check +,- only occur in the first position. - if {[tcl::string::last - $str] > 0} { - return 0 - } - if {[tcl::string::last + $str] > 0} { - return 0 - } - set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores - #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) - if {![tcl::string::is integer -strict $numeric_value]} { - return 0 - } - #!todo - check bounds only based on some config value - #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements. - #presumably very large numbers would have to be supplied in a toml file as strings. - #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max - if {$numeric_value > $::tomlish::max_int} { - return 0 - } - if {$numeric_value < $::tomlish::min_int} { - return 0 - } - } else { - return 0 - } - #Got this far - didn't find anything wrong with it. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'float'. 
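    # Illustrative behaviour sketch (assuming the tomlish package is loaded) of the integer/float
    # validators defined in this section:
    #   ::tomlish::utils::is_int 1_000     ;# -> 1 (underscores stripped before Tcl's integer test)
    #   ::tomlish::utils::is_int 012       ;# -> 0 (leading zero only permitted for 0x/0o/0b forms)
    #   ::tomlish::utils::is_int 3e4       ;# -> 0 (passes the character filter but is not a toml integer)
    #   ::tomlish::utils::is_float -2E-22  ;# -> 1
    #   ::tomlish::utils::is_float Inf     ;# -> 0 (toml only allows the lowercase inf/nan forms)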
- proc float_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { - return 1 - } else { - #only allow lower case for these special values - as per Toml 1.0 spec - if {$str ni {inf +inf -inf nan +nan -nan}} { - return 0 - } else { - return 1 - } - } - } - - proc is_float {str} { - set matches [regexp -all {[eE0-9\_\-\+\.]} $str] - #don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) - if {$str in {inf +inf -inf nan +nan -nan}} { - return 1 - } - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) - #Toml spec also disallows leading zeros in the exponent part - #... but this seems less interoperable (some libraries generate leading zeroes in exponents) - #for now we will allow leading zeros in exponents - #!todo - configure 'strict' option to disallow? - #first strip any +, - or _ (just for this test) - set check [tcl::string::map {+ "" - "" _ ""} $str] - set r {([0-9])*} - regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E - set z {([0])*} - regexp $z $intpart leadingzeros - if {[tcl::string::length $leadingzeros] > 1} { - return 0 - } - #for floats, +,- may occur in multiple places - #e.g -2E-22 +3e34 - #!todo - check bounds ? - - #strip underscores for tcl double check - set check [tcl::string::map {_ ""} $str] - #string is double accepts inf nan +NaN etc. - if {![tcl::string::is double $check]} { - return 0 - } - - } else { - return 0 - } - #Got this far - didn't find anything wrong with it. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'datetime'. - proc datetime_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - #review - we - proc is_datetime {str} { - #e.g 1979-05-27 - #e.g 1979-05-27T00:32:00Z - #e.g 1979-05-27 00:32:00-07:00 - #e.g 1979-05-27 00:32:00+10:00 - #e.g 1979-05-27 00:32:00.999999-07:00 - - #review - #minimal datetimes? - # 2024 ok - shortest valid 4 digit year? - # 02:00 ok - # 05-17 ok - if {[string length $str] < 4} { - return 0 - } - - set matches [regexp -all {[zZtT0-9\-\+\.:]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - #!todo - use full RFC 3339 parser? - lassign [split $str T] datepart timepart - #!todo - what if the value is 'time only'? - - #Tcl's free-form clock scan (no -format option) is deprecated - # - #if {[catch {clock scan $datepart} err]} { - # puts stderr "tcl clock scan failed err:'$err'" - # return 0 - #} - - #!todo - verify time part is reasonable - } else { - return 0 - } - return 1 - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] -} - -namespace eval tomlish::parse { - #*** !doctools - #[subsection {Namespace tomlish::parse}] - #[para] - #[list_begin definitions] - - #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. - #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: - # - e.g some kind of backtracking required if using an ABNF parser? 
- #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" - #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' - - #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? - - #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) - - - variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text - - variable state - # states: - # table-space, itable-space, array-space - # value-expected, keyval-syntax, - # quoted-key, squoted-key - # string-state, literal-state, multistring... - # - # notes: - # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack - - # - # value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax - # - #stateMatrix defines for each state, actions to take for each possible token. - #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. - #dual-element actions are a push instruction and the name of the space to push on the stack. - # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if and only if not already in root table-space) - - # -- --- --- --- --- --- - #token/state naming guide - # -- --- --- --- --- --- - #tokens : underscore separated or bare name e.g newline, start_quote, start_squote - #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence) - #states : always contain at least one dash e.g err-state, table-space - #instructions - # -- --- --- --- --- --- - - - #stateMatrix dict of elements mapping current state to next state based on returned tokens - # current-state {token-encountered next-state ... } - # where next-state can be a 1 or 2 element list.
- #If 2 element - the first item is an instruction (ucase) - #If 1 element - it is either a lowercase dashed state name or an ucase instruction - #e.g {PUSHSPACE } or POPSPACE or SAMESPACE - - - #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases - - variable stateMatrix - set stateMatrix [dict create] - - #xxx-space vs xxx-syntax inadequately documented - TODO - - # --------------------------------------------------------------------------------------------------------------# - # incomplete example of some state starting at table-space - # --------------------------------------------------------------------------------------------------------------# - # ( = -> value-expected) - # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) - # keyval-space (autotransition on push ^) - # table-space (barekey^) (startquote -> quoted-key ^) - # --------------------------------------------------------------------------------------------------------------# - - dict set stateMatrix\ - table-space { - bom "table-space"\ - whitespace "table-space"\ - newline "table-space"\ - barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ - startquote "quoted-key"\ - XXXstartsquote "squoted-key"\ - comment "table-space"\ - starttablename "tablename-state"\ - starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ - comma "err-state"\ - eof "end-state"\ - equal "err-state"\ - } - - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}}\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - startquote "quoted-key"\ - startsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "itable-space"\ - comment "err-state"\ - eof "err-state"\ - } - - - dict set stateMatrix\ - keyval-space {\ - whitespace "keyval-syntax"\ - equal "keyval-value-expected"\ - } - - # ' = ' portion of keyval - dict set stateMatrix\ - keyval-syntax {\ - whitespace "keyval-syntax"\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - barekey {PUSHSPACE "dottedkey-space"}\ - equal "keyval-value-expected"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #### - dict set stateMatrix\ - keyval-value-expected {\ - whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - 
leading-squote-space {\ - squote_seq "POPSPACE"\ - } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } - - dict set stateMatrix\ - keyval-tail {\ - whitespace "keyval-tail"\ - newline "POPSPACE"\ - comment "keyval-tail"\ - eof "end-state"\ - } - - dict set stateMatrix\ - itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - barekey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ - newline "err-state"\ - eof "err-state"\ - } - dict set stateMatrix\ - itable-keyval-value-expected {\ - whitespace "itable-keyval-value-expected"\ - untyped_value {TOSTATE "itable-val-tail" note ""}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ - startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ - startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ - } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } - - dict set stateMatrix\ - itable-val-tail {\ - whitespace "itable-val-tail"\ - endinlinetable "POPSPACE"\ - comma "POPSPACE"\ - Xnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "err-state"\ - comment "itable-val-tail"\ - eof "err-state"\ - } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } - - - - - - dict set stateMatrix\ - value-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - startmultiquote {PUSHSPACE "multistring-space"}\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - - #dottedkey-space is not used within [tablename] or [[tablearrayname]] - #it is for keyval ie x.y.z = value - dict set stateMatrix\ - dottedkey-space {\ - whitespace "dottedkey-space"\ - dotsep "dottedkey-space"\ - barekey "dottedkey-space"\ - squotedkey "dottedkey-space"\ - quotedkey "dottedkey-space"\ - equal "POPSPACE"\ - newline "err-state"\ - comma "err-state"\ - comment "err-state"\ - } - #dottedkeyend "POPSPACE" - - - - - #REVIEW - #toml spec looks like heading towards allowing newlines within inline tables - #https://github.com/toml-lang/toml/issues/781 - dict set stateMatrix\ - curly-syntax {\ - whitespace "curly-syntax"\ - newline "curly-syntax"\ - barekey {PUSHSPACE "itable-keyval-space"}\ - itablequotedkey 
"itable-keyval-space"\ - endinlinetable "POPSPACE"\ - startquote "itable-quoted-key"\ - comma "itable-space"\ - comment "itable-space"\ - eof "err-state"\ - } - #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES - #We currently allow multiline ITABLES (also with comments) in the tokenizer. - #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? - - - #JMN REVIEW - dict set stateMatrix\ - array-space {\ - whitespace "array-space"\ - newline "array-space"\ - untyped_value "SAMESPACE"\ - startarray {PUSHSPACE "array-space"}\ - endarray "POPSPACE"\ - startmultiquote {PUSHSPACE multistring-space}\ - startinlinetable {PUSHSPACE itable-space}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ - comma "array-space"\ - comment "array-space"\ - eof "err-state-array-space-got-eof"\ - } - dict set stateMatrix\ - array-syntax {\ - whitespace "array-syntax"\ - newline "array-syntax"\ - untyped_value "SAMESPACE"\ - startarray {PUSHSPACE array-space}\ - endarray "POPSPACE"\ - startmultiquote {PUSHSPACE multistring-space}\ - startquote "string-state"\ - startsquote "literal-state"\ - comma "array-space"\ - comment "err-state"\ - } - - - - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - quotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - dict set stateMatrix\ - squoted-key {\ - whitespace "NA"\ - squotedkey "squoted-key"\ - newline "err-state"\ - } - # endsquote {PUSHSPACE "keyval-space"} - - dict set stateMatrix\ - string-state {\ - whitespace "NA"\ - string "string-state"\ - endquote "SAMESPACE"\ - newline "err-state"\ - eof "err-state"\ - } - dict set stateMatrix\ - literal-state {\ - whitespace "NA"\ - literal "literal-state"\ - endsquote "SAMESPACE"\ - newline "err-state"\ - eof "err-state"\ - } - - - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } - dict set stateMatrix\ - multistring-space {\ - whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ - } - - - #only valid subparts are literalpart and newline. 
other whitespace etc is within literalpart - #todo - treat sole cr as part of literalpart but crlf and lf as newline - dict set stateMatrix\ - multiliteral-space {\ - literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ - } - - #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes - dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ - } - - - dict set stateMatrix\ - tablename-state {\ - whitespace "NA"\ - tablename {zeropoppushspace table-space}\ - tablename2 {PUSHSPACE table-space}\ - endtablename "tablename-tail"\ - comma "err-state"\ - newline "err-state"\ - } - dict set stateMatrix\ - tablearrayname-state {\ - whitespace "NA"\ - tablearrayname {zeropoppushspace table-space}\ - tablearrayname2 {PUSHSPACE table-space}\ - endtablearray "tablearrayname-tail"\ - comma "err-state"\ - newline "err-state"\ - } - - dict set stateMatrix\ - tablename-tail {\ - whitespace "tablename-tail"\ - newline "table-space"\ - comment "tablename-tail"\ - eof "end-state"\ - } - dict set stateMatrix\ - tablearrayname-tail {\ - whitespace "tablearrayname-tail"\ - newline "table-space"\ - comment "tablearrayname-tail"\ - eof "end-state"\ - } - dict set stateMatrix\ - end-state {} - - set knowntokens [list] - set knownstates [list] - dict for {state transitions} $stateMatrix { - if {$state ni $knownstates} {lappend knownstates $state} - dict for {tok instructions} $transitions { - if {$tok ni $knowntokens} {lappend knowntokens $tok} - } - } - dict set stateMatrix nostate {} - foreach tok $knowntokens { - dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" - } - - - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #purpose - debugging? remove? - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #build a list of 'push triggers' from the stateMatrix - # ie tokens which can push a new space onto spacestack - set push_trigger_tokens [list] - tcl::dict::for {s transitions} $stateMatrix { - tcl::dict::for {token transition_to} $transitions { - set instruction [lindex $transition_to 0] - switch -exact -- $instruction { - PUSHSPACE - zeropoppushspace { - if {$token ni $push_trigger_tokens} { - lappend push_trigger_tokens $token - } - } - } - } - } - ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - - - - #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) - #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE - - #mainly for the -space states: - #redirect to another state $c based on a state transition from $whatever to $b - # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. - #this is useful as we often don't know state $b. 
e.g when it is decided by 'POPSPACE' - - #Push to, next - #default first states when we push to these spaces - variable spacePushTransitions { - keyval-space keyval-syntax - itable-keyval-space itable-keyval-syntax - array-space array-space - table-space tablename-state - } - #itable-space itable-space - #Pop to, next - variable spacePopTransitions { - array-space array-syntax - } - #itable-space curly-syntax - #itable-keyval-space itable-val-tail - #review - #we pop to keyval-space from dottedkey-space or from value-expected? we don't always want to go to keyval-tail - #leave it out and make the POPSPACE caller explicitly specify it - #keyval-space keyval-tail - - variable spaceSameTransitions { - array-space array-syntax - } - #itable-space curly-syntax - #itable-keyval-space itable-val-tail - - - variable state_list ;#reset every tomlish::decode::toml - - namespace export tomlish toml - namespace ensemble create - - #goNextState has various side-effects e.g pushes and pops spacestack - #REVIEW - setting nest and v elements here is ugly - #todo - make neater, more single-purpose? - proc goNextState {tokentype tok currentstate} { - variable state - variable nest - variable v - - set prevstate $currentstate - - - variable spacePopTransitions - variable spacePushTransitions - variable spaceSameTransitions - - variable last_space_action "none" - variable last_space_type "none" - variable state_list - - set result "" - set starttok "" - - if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { - set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] - ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" - switch -exact -- [lindex $transition_to 0] { - POPSPACE { - spacestack pop - set parent_info [spacestack peek] - set type [dict get $parent_info type] - set parentspace [dict get $parent_info state] - - set last_space_action "pop" - set last_space_type $type - - if {[dict exists $parent_info returnstate]} { - set next [dict get $parent_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" - } else { - ### - #review - do away with spacePopTransitions - which although useful to provide a default.. 
- # - involve error-prone configurations distant to the main state transition configuration in stateMatrix - if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { - set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" - } else { - set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" - } - } - set result $next - } - SAMESPACE { - set currentspace_info [spacestack peek] - ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" - set type [dict get $currentspace_info type] - set currentspace [dict get $currentspace_info state] - - if {[dict exists $currentspace_info returnstate]} { - set next [dict get $currentspace_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" - } else { - if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { - set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" - } else { - set next $currentspace - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" - } - } - set result $next - } - zeropoppushspace { - if {$nest > 0} { - #pop back down to the root level (table-space) - spacestack pop - set parentinfo [spacestack peek] - set type [dict get $parentinfo type] - set target [dict get $parentinfo state] - - set last_space_action "pop" - set last_space_type $type - - #----- - #standard pop - set parentlevel [expr {$nest -1}] - lappend v($parentlevel) [set v($nest)] - incr nest -1 - #----- - } - #re-entrancy - - #set next [list PUSHSPACE [lindex $transition_to 1]] - set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 - #::tomlish::log::notice "goNextState REENTRANCY. calling goNextState $nexttokentype $tokentype" - #set result [::tomlish::parse::goNextState $nexttokentype $tokentype] - ::tomlish::log::debug "--->> zeropoppushspace goNextState REENTRANCY. calling goNextState $nexttokentype $currentstate" - set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] - set result [dict get $transition_info newstate] - } - PUSHSPACE { - set original_target [dict get $transition_to PUSHSPACE] - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. 
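                        # Worked example (illustrative) of the returnstate handling above: the
                        # stateMatrix entry for token startarray in state keyval-value-expected is
                        #   {PUSHSPACE array-space returnstate keyval-tail}
                        # so the current (keyval) space entry is re-pushed carrying returnstate
                        # keyval-tail before array-space is pushed; the POPSPACE triggered by the
                        # matching endarray then resumes at keyval-tail rather than falling back to
                        # the default POPSPACE target for the parent space.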
- } - if {[dict exists $transition_to starttok]} { - set starttok [dict get $transition_to starttok] - } - spacestack push [dict create type space state $original_target] - - set last_space_action "push" - set last_space_type "space" - - if {[dict exists $transition_to state]} { - #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) - set next [dict get $transition_to state] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" - } else { - #puts $::tomlish::parse::spacePushTransitions - if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { - set next [dict get $::tomlish::parse::spacePushTransitions $original_target] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " - } else { - set next $original_target - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" - } - } - set result $next - } - TOSTATE { - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. - } - set result [dict get $transition_to TOSTATE] - } - default { - #simplified version of TOSTATE - set result [lindex $transition_to 0] ;#ignore everything but first word - } - } - } else { - ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" - set result "nostate" - } - lappend state_list [list tokentype $tokentype from $currentstate to $result] - set state $result - ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " - return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] - } - - proc report_line {{line ""}} { - variable linenum - variable is_parsing - if {$is_parsing} { - if {$line eq ""} { - set line $linenum - } - return "Line Number: $line" - } else { - #not in the middle of parsing tomlish text - return nothing. - return "" - } - } - - #produce a *slightly* more readable string rep of the nest for puts etc. 
- proc nest_pretty1 {list} { - set prettier "{" - - foreach el $list { - if { [lindex $el 0] eq "NEWLINE"} { - append prettier "[list $el]\n" - } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY QKEY SQKEY TABLE ARRAY})} { - append prettier [nest_pretty1 $el] - } else { - append prettier "[list $el] " - } - } - append prettier "}" - return $prettier - } - - proc set_tokenType {t} { - variable tokenType - variable tokenType_list - if {![info exists tokenType]} { - set tokenType "" - } - lappend tokenType_list $t - set tokenType $t - } - - proc switch_tokenType {t} { - variable tokenType - variable tokenType_list - lset tokenType_list end $t - set tokenType $t - } - - proc get_tokenType {} { - variable tokenType - return $tokenType - } - - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } - - proc get_token_waiting {} { - variable token_waiting - return [lindex $token_waiting 0] - } - proc clear_token_waiting {} { - variable token_waiting - set token_waiting [list] - } - - #token_waiting is a list - but our standard case is to have only one - #in certain circumstances such as near eof we may have 2 - #the set_token_waiting function only allows setting when there is not already one waiting. - #we want to catch cases of inadvertently trying to set multiple - # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. - proc set_token_waiting {args} { - if {[llength $args] %2 != 0} { - error "tomlish set_token_waiting must have args of form: type value complete 0|1" - } - variable token_waiting - - if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { - #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another - #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context - #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it - set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" - append err \n " - cannot add token_waiting: $args" - error $err - #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] - #set token_waiting [list] - } - - set waiting [dict create] - dict for {k v} $args { - switch -exact $k { - type - complete { - dict set waiting $k $v - } - value { - dict set waiting tok $v - } - startindex { - dict set waiting startindex $v - } - default { - error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" - } - } - } - if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { - error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" - } - if {![llength $token_waiting]} { - set token_waiting [list $waiting] - } else { - #an extra sanity-check that we don't have more than just the eof.. - if {[llength $token_waiting] > 1} { - set err "tomlish Unexpected. 
Existing token_waiting count > 1.\n" - foreach tw $token_waiting { - append err " $tw" \n - } - append err " - cannot add token_waiting: $waiting" - error $err - } - #last entry must be a waiting eof - set token_waiting [list $waiting [lindex $token_waiting end]] - } - return - } - - #returns 0 or 1 - #tomlish::parse::tok - #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag - # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) - # - interactive use? - - proc tok {s} { - variable nest - variable v - variable i - variable tok - variable type ;#character type - variable state ;#FSM - - - variable tokenType - variable tokenType_list - - - variable endToken - - variable lastChar - - variable braceCount - variable bracketCount - - - #------------------------------ - #Previous run found another (presumably single-char) token - #The normal case is for there to be only one dict in the list - #multiple is an exception - primarily for eof - variable token_waiting - if {[llength $token_waiting]} { - set waiting [lindex $token_waiting 0] - - set tokenType [dict get $waiting type] - set tok [dict get $waiting tok] - #todo: dict get $token_waiting complete - set token_waiting [lrange $token_waiting 1 end] - return 1 - } - #------------------------------ - - set resultlist [list] - set sLen [tcl::string::length $s] - - set slash_active 0 - set quote 0 - set c "" - set multi_dquote "" - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $s [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $s $i] - set cindex $i - tomlish::log::debug "- tokloop char <$c> index $i tokenType:$tokenType tok:<$tok>" - #puts "got char $c during tokenType '$tokenType'" - incr i ;#must incr here because we do returns inside the loop - - set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] - switch -exact -- $ctest { - # { - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - barekey { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" - } - whitespace { - # hash marks end of whitespace token - #do a return for the whitespace, set token_waiting - #set_token_waiting type comment value "" complete 1 - incr i -1 ;#leave comment for next run - return 1 - } - untyped_value { - #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? - #we will accept a comment marker as an immediate terminator of the untyped_value. - incr i -1 - return 1 - } - starttablename - starttablearrayname { - #fix! - error "tomlish Character '#' is invalid first character for $tokenType. 
[tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out - append tok $c - } - default { - #quotedkey, itablequotedkey, string,literal, multistring - append tok $c - } - } - } else { - switch -- $state { - multistring-space { - set_tokenType stringpart - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes#" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "#" - } - default { - #start of token if we're not in a token - set_tokenType comment - set tok "" ;#The hash is not part of the comment data - } - } - } - } - lc { - #left curly brace - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey - itablesquotedkey { - append tok $c - } - string - quotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - starttablename - starttablearrayname { - #*bare* tablename can only contain letters,digits underscores - error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #valid in quoted parts - append tok $c - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - switch -exact -- $state { - itable-keyval-value-expected - keyval-value-expected - value-expected { - #switch last key to tablename?? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - array-space - array-syntax { - #nested anonymous inline table - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\{" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\{" - } - default { - error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" - } - } - } - - } - rc { - #right curly brace - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey - itablesquotedkey { - append tok $c - } - string - quotedkey - itablequotedkey - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - starttablename - tablename { - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endinlinetable value "" complete 1 startindex $cindex - return 1 - } - starttablearrayname - tablearrayname { - if {$had_slash} {append tok "\\"} - #invalid! 
- but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } - itable-val-tail { - #review - error "tomlish right-curly in itable-val-tail" - } - default { - #end any other token - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - value-expected { - #invalid - but allow parser statemachine to report it. - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-space { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname-state problem" - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - #invalid - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - curly-syntax { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-val-tail { - set_tokenType "endinlinetable" - set tok "" - #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 - incr i -1 - return 1 - } - itable-keyval-syntax { - error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\}" - } - multiliteral-space { - set_tokenType "literalpart" ; #review - set tok "\}" - } - default { - #JMN2024b keyval-tail? - error "tomlish state '$state'. 
endinlinetable case not implemented [tomlish::parse::report_line]" - } - } - } - - } - lb { - #left square bracket - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey - itablesquotedkey { - append tok $c - } - string - quotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - starttablename { - #change the tokenType - switch_tokenType "starttablearrayname" - set tok "" ;#no output into the tomlish list for this token - #any following whitespace is part of the tablearrayname, so return now - return 1 - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\[" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - #invalid at this point - state machine should disallow table -> starttablearrayname - set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "\[" - } - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - keyval-value-expected - itable-keyval-value-expected - value-expected { - set_tokenType "startarray" - set tok "\[" - return 1 - } - table-space { - #table name - #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray - #note that a starttablearrayname token may contain whitespace between the brackets - # e.g \[ \[ - set_tokenType "starttablename" - set tok "" ;#there is no output into the tomlish list for this token - } - array-space - array-syntax { - #nested array? - set_tokenType "startarray" - set tok "\[" - return 1 - #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\[" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\[" - } - itable-space { - #handle state just to give specific error msg - error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" - } - default { - error "tomlish state: '$state'. 
startarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - rb { - #right square bracket - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey - itablesquotedkey { - append tok $c - } - string - quotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - #???? - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } else { - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablename value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" - } - } - } - tablearraynames { - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } - default { - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - value-expected { - #invalid - but allow parser statemachine to report it. - set_tokenType "endarray" - set tok "\]" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endarray" - set tok "\]" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endtablename" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname problem" - set_tokenType "endtablearray" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - set_tokenType "endarray" - set tok "\]" - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\]" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\]" - } - default { - error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- #backslash - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - #end whitespace token - incr i -1 ;#reprocess bsl in next run - return 1 - } else { - error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" - } - } - literal - literalpart - squotedkey - itablesquotedkey { - #never need to set slash_active true when in single quoted tokens - append tok "\\" - set slash_active 0 - } - string - quotedkey - itablequotedkey - comment { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - stringpart { - if {$slash_active} { - #assert - quotes empty - or we wouldn't have slash_active - set slash_active 0 - append tok "\\\\" - } else { - append tok $dquotes - set slash_active 1 - } - } - starttablename - starttablearrayname { - error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" - } - tablename - tablearrayname { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - barekey { - error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" - } - default { - error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$slash_active} { - set_tokenType "stringpart" - set tok "\\\\" - set slash_active 0 - } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } - set slash_active 1 - } - } - multiliteral-space { - #nothing can be escaped in multiliteral-space - not even squotes (?) review - set_tokenType "literalpart" - set tok "\\" - } - default { - error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" - } - } - } - } - sq { - #single quote - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } - } - } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - #temp token creatable only during value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - append tok $c - } - 2 { - #switch? 
- append tok $c - set_tokenType triple_squote - return 1 - } - default { - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" - } - } - } - literal { - #slash_active always false - #terminate the literal - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - literalpart { - #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) - #todo - # idea: end this literalpart (possibly 'temporarily') - # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack - # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing - return 1 - } - itablesquotedkey { - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - squotedkey { - ### - #set_token_waiting type endsquote value "'" complete 1 - return 1 - } - starttablename - starttablearrayname { - #!!! - incr i -1 - return 1 - } - tablename - tablearrayname { - append tok $c - } - default { - append tok $c - } - } - } else { - switch -exact -- $state { - value-expected - array-space { - set_tokenType "_start_squote_sequence" - set tok "'" - } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" - set tok "'" - return 1 - } - table-space { - ### - set_tokenType "squotedkey" - set tok "" - } - itable-space { - set_tokenType "squote_seq_begin" - set tok "'" - return 1 - } - tablename-state { - #first char in tablename-state/tablearrayname-state - set_tokenType tablename - append tok "'" - } - tablearrayname-state { - set_tokenType tablearrayname - append tok "'" - } - literal-state { - tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal - incr -1 - return 1 - } - multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 - } - dottedkey-space { - set_tokenType squotedkey - } - default { - error "tomlish unhandled squote during state '$state'. [tomlish::parse::report_line]" - } - } - } - - } - dq { - #double quote - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? 
- set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - set_tokenType "startsquote" - incr i -1 - return 1 - } - 2 { - set_tokenType "startsquote" - incr i -2 - return 1 - } - default { - error "tomlish unexpected _start_squote_sequence length $toklen" - } - } - } - literal - literalpart { - append tok $c - } - string { - if {$had_slash} { - append tok "\\" $c - } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex - return 1 - } - } - stringpart { - #sub element of multistring - if {$had_slash} { - append tok "\\" $c - } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } - } - } - whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected - value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - quotedkey - itablequotedkey { - if {$had_slash} { - append tok "\\" - append tok $c - } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex - return 1 - } - } - squotedkey - itablesquotedkey { - append tok $c - } - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - starttablearrayname { - incr i -1 ;## - return 1 - } - default { - error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - #$slash_active not relevant when no tokenType - #token is string only if we're expecting a value at this point - switch -exact -- $state { - keyval-value-expected - value-expected - array-space { - #!? start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - itable-keyval-value-expected { - #JMN 2025 - review - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - multistring-space { - #TODO - had_slash!!! 
- #REVIEW - if {$had_slash} { - set_tokenType "stringpart" - set tok "\\\"" - set multi_dquote "" - } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\"" - } - table-space { - set_tokenType "startquote" - set tok $c - return 1 - } - itable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - dottedkey-space { - set_tokenType dquote_seq_begin - set tok $c - } - default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" - } - } - } - } - = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" - append tok $c - } - string - comment - quotedkey - itablequotedkey { - #for these tokenTypes an = is just data. - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - whitespace { - if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - set_token_waiting type equal value = complete 1 startindex $cindex - return 1 - } - } - barekey { - #set_token_waiting type equal value = complete 1 - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out - append tok $c - } - default { - error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok ${dquotes}= - } - multiliteral-space { - set_tokenType "literalpart" - set tok "=" - } - dottedkey-space { - set_tokenType "equal" - set tok "=" - return 1 - } - default { - set_tokenType "equal" - set tok = - return 1 - } - } - } - } - cr { - #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! - # \r carriage return - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
- set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal { - append tok $c - } - literalpart { - #part of MLL string (multi-line literal string) - #we need to split out crlf as a separate NEWLINE to be consistent - ::tomlish::log::warn "literalpart ended by cr - needs testing" - #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space - incr i -1 - return 1 - } - stringpart { - #part of MLB string (multi-line basic string) - #jmn2025 - review - #append tok $dquotes$c - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #could in theory be valid in quoted part of name - #review - might be better just to disallow here - append tok $c - } - default { - #!todo - error out if cr inappropriate for tokenType - append tok $c - } - } - } else { - #lf may be appended if next - #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) - set_tokenType "newline" - set tok cr - } - } - lf { - # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal { - #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' - #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - literalpart { - #we allow newlines - but store them within the multiliteral as their own element - #This is a legitimate end to the literalpart - but not the whole multiliteral - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - newline { - #review - #this lf is the trailing part of a crlf - append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok - return 1 - } - stringpart { - if {$dquotes ne ""} { - append tok $dquotes - incr i -1 - return 1 - } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - } - starttablename - tablename - tablearrayname - starttablearrayname { - error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" - } - default { - #newline ends all other tokens. - #note for string: we don't add (raw unescaped) newline to simple string. 
(must use multi-string for this) - #note for whitespace: - # we will use the convention that \n terminates the current whitespace even if whitespace follows - # ie whitespace is split into separate whitespace tokens at each newline - - #puts "-------------- newline lf during tokenType $tokenType" - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } - set_tokenType "newline" - set tok lf - return 1 - } - } - multiliteral-space { - #assert had_slash 0, multi_dquote "" - set_tokenType "newline" - set tok "lf" - return 1 - } - default { - #ignore slash? error? - set_tokenType "newline" - set tok lf - return 1 - } - } - #if {$had_slash} { - # #CONT directly before newline - allows strings_5_byteequivalent test to pass - # set_tokenType "continuation" - # set tok "\\" - # incr i -1 - # return 1 - #} else { - # set_tokenType newline - # set tok lf - # return 1 - #} - } - } - , { - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - comment - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok , - } - string - quotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - #stringpart can have up to 2 quotes too - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - literal - literalpart - squotedkey - itablesquotedkey { - #assert had_slash always 0, multi_dquote "" - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - set_token_waiting type comma value "," complete 1 startindex $cindex - return 1 - } - } - default { - set_token_waiting type comma value "," complete 1 startindex $cindex - if {$had_slash} {append tok "\\"} - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "$dquotes," - } - multiliteral-space { - #assert had_slash 0, multi_dquote "" - set_tokenType "literalpart" - set tok "," - } - default { - set_tokenType "comma" - set tok "," - return 1 - } - } - } - } - . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - comment - untyped_value { - if {$had_slash} {append tok "\\"} - append tok $c - } - string - quotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - literal - literalpart - squotedkey - itablesquotedkey { - #assert had_slash always 0, multi_dquote "" - append tok $c - } - whitespace { - switch -exact -- $state { - multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] - if {$had_slash} { - incr backchars 1 - } - incr i -$backchars - return 1 - } - dottedkey-space { - incr i -1 - return 1 - } - default { - error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" - } - } - } - starttablename - starttablearrayname { - #This would correspond to an empty table name - error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #subtable - split later - review - append tok $c - } - barekey { - #e.g x.y = 1 - #we need to transition the barekey to become a structured table name ??? review - #x is the tablename y is the key - set_token_waiting type dotsep value "." complete 1 startindex $cindex - return 1 - } - default { - error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" - #set_token_waiting type period value . complete 1 - #return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "$dquotes." - } - multiliteral-space { - set_tokenType "literalpart" - set tok "." - } - dottedkey-space { - ### - set_tokenType "dotsep" - set tok "." - return 1 - } - default { - set_tokenType "untyped_value" - set tok "." - } - } - } - - } - " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- if {[tcl::string::length $tokenType]} { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - barekey { - #todo had_slash - emit token or error - #whitespace is a terminator for bare keys - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - comment { - if {$had_slash} { - append tok "\\" - } - append tok $dquotes$c - } - string - quotedkey - itablequotedkey { - if {$had_slash} { append tok "\\" } - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - append tok $dquotes - incr i -1 - return 1 - } - } - literal - literalpart - squotedkey - itablesquotedkey { - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } - } else { - append tok $c - } - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. - set_tokenType tablename - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } - set_tokenType "whitespace" - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - if {$had_slash} { - error "tomlish unexpected backslash [tomlish::parse::report_line]" - } - set_tokenType "whitespace" - append tok $c - } - } - } - } - tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
- - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - barekey { - #whitespace is a terminator for bare keys - incr i -1 - #set_token_waiting type whitespace value $c complete 1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - quotedkey - itablequotedkey - squotedkey - itablesquotedkey { - append tok $c - } - string - comment - whitespace { - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - append tok $dquotes - incr i -1 - return 1 - } - } - literal - literalpart { - append tok $c - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearraynames { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - if {$slash_active} { - set slash_active 0 - } - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - set_tokenType "whitespace" - append tok $c - } - } - } - } - bom { - #BOM (Byte Order Mark) - ignored by token consumer - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - _start_squote_sequence { - #assert - tok will be one or two squotes only - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart { - append tok $c - } - default { - set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - set_tokenType "bom" - set tok "\uFEFF" - return 1 - } - } - } - } - default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
- set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } - } else { - #review - incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. - return 1 - } - } - barekey { - if {[tomlish::utils::is_barekey $c]} { - append tok $c - } else { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" - } - } - starttablename - starttablearrayname { - incr i -1 - #allow statemachine to set context for subsequent chars - return 1 - } - stringpart { - append tok $dquotes$c - } - default { - #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname - append tok $c - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - table-space - itable-space { - #if no currently active token - assume another key value pair - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } - curly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } - multistring-space { - set_tokenType "stringpart" - if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time - set tok \\$c - } else { - set tok $dquotes$c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - tablename-state { - set_tokenType "tablename" - set tok $c - } - tablearrayname-state { - set_tokenType "tablearrayname" - set tok $c - } - dottedkey-space { - set_tokenType barekey - set tok $c - } - default { - tomlish::log::debug "- tokloop char '$c' setting to untyped_value while state:$state" - set_tokenType "untyped_value" - set tok $c - } - } - } - } - } - - } - - #run out of characters (eof) - if {[tcl::string::length $tokenType]} { - #check for invalid ending tokens - #if {$state eq "err-state"} { - # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" - #} - switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. 
[tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open literal - error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" - } - 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] - set_tokenType "literal" - set tok "" - return 1 - } - } - } - } - set_token_waiting type eof value eof complete 1 startindex $i ;#review - return 1 - } else { - ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" - set tokenType "eof" - set tok "eof" - } - return 0 - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] -} - -namespace eval tomlish::dict { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - - proc is_tomlish_typeval {d} { - #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} - #as a sanity check we need to avoid mistaking user data that happens to match same form - #consider x.y={type="spud",value="blah"} - #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. - #check the length of the type as a quick way to see it's a tag - not something else masqerading. - expr {[dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} - } - proc is_tomlish_typeval2 {d} { - upvar ::tomlish::tags tags - expr {[lindex $d 0] eq "type" && [lindex $d 1] in $tags} - } - proc last_tomltype_posn {d} { - set last_simple -1 - set dictposn [expr {[dict size $d] -1}] - foreach k [lreverse [dict keys $d]] { - set dval [dict get $d $k] - if {[is_tomlish_typeval $dval]} { - set last_simple $dictposn - break - } - incr dictposn -1 - } - return $last_simple - } - - - #review - proc name_from_tablestack {tablestack} { - set name "" - foreach tinfo [lrange $tablestack 1 end] { - lassign $tinfo type namepart - switch -- $type { - T { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - I { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - default { - #end at first break in the leading sequence of T & I tablenames - break - } - } - } - return $name - } - -} - -tcl::namespace::eval tomlish::app { - variable applist [list encoder decoder test] - - #*** !doctools - #[subsection {Namespace tomlish::app}] - #[para] - #[list_begin definitions] - - proc decoder {args} { - #*** !doctools - #[call app::[fun decoder] [arg args]] - #[para] read toml on stdin until EOF - #[para] on error - returns non-zero exit code and writes error on stderr - #[para] on success - returns zero exit code and writes JSON encoding of the data on stdout - #[para] This decoder is intended to be compatible with toml-test - - set opts [dict merge [dict create] $args] - #fconfigure stdin -encoding utf-8 - fconfigure stdin -translation binary - #Just slurp it all - presumably we are not handling massive amounts of data on stdin. - # - even if the input is large, we probably don't gain much (aside from possible memory savings?) 
by attempting to process input as it arrives. - if {[catch { - set toml [read stdin] - }]} { - exit 2 ;#read error - } - try { - set j [::tomlish::toml_to_json $toml] - } on error {em} { - puts stderr "decoding failed: '$em'" - exit 1 - } - puts -nonewline stdout $j - exit 0 - } - - proc encoder {args} { - #*** !doctools - #[call app::[fun encoder] [arg args]] - #[para] read JSON on stdin until EOF - #[para] return non-zero exitcode if JSON data cannot be converted to a valid TOML representation - #[para] return zero exitcode and TOML data on stdout if JSON data can be converted. - #[para] This encoder is intended to be compatible with toml-test - - set opts [dict merge [dict create] $args] - fconfigure stdin -translation binary - if {[catch { - set json [read stdin] - }]} { - exit 2 ;#read error - } - try { - set toml [::tomlish::json_to_toml $json] - } on error {em} { - puts stderr "encoding failed: '$em'" - exit 1 - } - puts -nonewline stdout $toml - exit 0 - } - - proc test {args} { - set opts [dict merge [dict create] $args] - - package require test::tomlish - if {[dict exists $opts -suite]} { - test::tomlish::suite [dict get $opts -suite] - } - test::tomlish::run - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::app ---}] -} - -proc ::tomlish::appnames {} { - set applist [list] - foreach cmd [info commands ::tomlish::app::*] { - lappend applist [namespace tail $cmd] - } - return $applist -} - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Secondary API namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish::lib { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - #*** !doctools - #[subsection {Namespace tomlish::lib}] - #[para] Secondary functions that are part of the API - #[list_begin definitions] - - #proc utility1 {p1 args} { - # #*** !doctools - # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] - # #[para]Description of utility1 - # return 1 - #} - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -if {$argc > 0} { - puts stderr "argc: $argc args: $argv" - - if {($argc == 1)} { - if {[tcl::string::tolower $argv] in {help -help h -h}} { - puts stdout "Usage: -app where appname one of:[tomlish::appnames]" - exit 0 - } else { - puts stderr "Argument '$argv' not understood. 
Try -help" - exit 1 - } - } - set opts [dict create] - set opts [dict merge $opts $argv] - - set opts_understood [list -app ] - if {"-app" in [dict keys $opts]} { - #Don't vet the remaining opts - as they are interpreted by each app - } else { - foreach key [dict keys $opts] { - if {$key ni $opts_understood} { - puts stderr "Option '$key' not understood" - exit 1 - } - } - } - if {[dict exists $opts -app]} { - set app [dict get $opts -app] - if {$app ni [tomlish::appnames]} { - puts stderr "app '[dict get $opts -app]' not found" - exit 1 - } - tomlish::app::$app {*}$opts - } -} - -## Ready -package provide tomlish [namespace eval tomlish { - variable pkg tomlish - variable version - set version 1.1.2 -}] -return - -#*** !doctools -#[manpage_end] - diff --git a/src/vendormodules/tomlish-1.1.3.tm b/src/vendormodules/tomlish-1.1.3.tm deleted file mode 100644 index 3da39427..00000000 --- a/src/vendormodules/tomlish-1.1.3.tm +++ /dev/null @@ -1,6002 +0,0 @@ -# -*- tcl -*- -# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt -# -# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. -# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# (C) 2024 -# -# @@ Meta Begin -# Application tomlish 1.1.3 -# Meta platform tcl -# Meta license -# @@ Meta End - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# doctools header -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -#*** !doctools -#[manpage_begin tomlish_module_tomlish 0 1.1.3] -#[copyright "2024"] -#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] -#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] -#[require tomlish] -#[keywords module parsing toml configuration] -#[description] -#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) -#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml -#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, -#[para] although these other formats are generally unlikely to retain whitespace or comments -#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. -#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. -#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions -#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key -#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) -#[para] will need a -type option (-force ?) to force overriding with another type such as an int. 
- -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section Overview] -#[para] overview of tomlish -#[subsection Concepts] -#[para] - - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -## Requirements -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[subsection dependencies] -#[para] packages used by tomlish -#[list_begin itemized] - -package require Tcl 8.6- -package require struct::stack -package require logger - -#*** !doctools -#[item] [package {Tcl 8.6-}] -#[item] [package {struct::stack}] - -#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') - - -#*** !doctools -#[list_end] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section API] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Base namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish { - namespace export {[a-z]*}; # Convention: export all lowercase - variable types - - #IDEAS: - # since get_toml produces tomlish with whitespace/comments intact: - # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace - # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? - # - separate addKey?? - # - deleteKey (delete leaf) - # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) - # - set/add Table? - position in doc based on existing tables/subtables? - - #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - - # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. - #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n - #The newline is part of the keyval structure so makes reordering easier - #example from_toml "a=1\nb=2\n\n\n" - # 0 = TOMLISH - # 1 = KEY a = {INT 1} {NEWLINE lf} - # 2 = NEWLINE lf - # 3 = KEY b = {INT 2} {NEWLINE lf} - # 4 = NEWLINE lf - # 5 = NEWLINE lf - - - #ARRAY is analogous to a Tcl list - #TABLE is analogous to a Tcl dict - #WS = inline whitespace - #KEY = bare key and value - #DQKEY = double quoted key and value - #SQKEY = single quoted key and value - #ITABLE = inline table (*can* be anonymous table) - # inline table values immediately create a table with the opening brace - # inline tables are fully defined between their braces, as are dotted-key subtables defined within - # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained - - set tags [list TOMLISH ARRAY TABLE ITABLE ANONTABLE WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL DATETIME] - #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) - #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) 
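To make the 64-bit restriction mentioned above concrete, a range check along the following lines is all that is needed. The helper name int_in_toml_range is hypothetical; the bounds are the min_int/max_int values set immediately below.

    # Illustrative only - 'int_in_toml_range' is a hypothetical helper, not part of this module.
    proc int_in_toml_range {i} {
        set min_int -9223372036854775808   ;# -2^63
        set max_int  9223372036854775807   ;# 2^63-1
        expr {[string is entier -strict $i] && $i >= $min_int && $i <= $max_int}
    }
    # int_in_toml_range 9223372036854775807   -> 1
    # int_in_toml_range 9223372036854775808   -> 0 (outside the range tomlish accepts)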
- set min_int -9223372036854775808 ;#-2^63 - set max_int +9223372036854775807 ;#2^63-1 - - proc Dolog {lvl txt} { - #return "$lvl -- $txt" - set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" - puts stderr $msg - } - logger::initNamespace ::tomlish - foreach lvl [logger::levels] { - interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl - log::logproc $lvl tomlish_log_$lvl - } - - #*** !doctools - #[subsection {Namespace tomlish}] - #[para] Core API functions for tomlish - #[list_begin definitions] - - proc tags {} { - return $::tomlish::tags - } - - #helper function for to_dict - proc _get_keyval_value {keyval_element} { - log::notice ">>> _get_keyval_value from '$keyval_element'<<<" - set found_value 0 - #find the value - # 3 is the earliest index at which the value could occur (depending on whitespace) - set found_sub [list] - if {[lindex $keyval_element 2] ne "="} { - error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list" - } - - #review - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists - } - - foreach sub [lrange $keyval_element 2 end] { - #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey - switch -exact -- [lindex $sub 0] { - STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { - set type [lindex $sub 0] - set value [lindex $sub 1] - set found_sub $sub - incr found_value 1 - } - default {} - } - } - if {!$found_value} { - error "tomlish Failed to find value element in KEY. '$keyval_element'" - } - if {$found_value > 1} { - error "tomlish Found multiple value elements in KEY, expected exactly one. '$keyval_element'" - } - - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - #simple (non-container, no-substitution) datatype - set result [list type $type value $value] - } - STRING - STRINGPART { - set result [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL - LITERALPART { - #REVIEW - set result [list type $type value $value] - } - TABLE { - #invalid? - error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" - } - ITABLE { - #This one should not be returned as a type value structure! - # - set result [::tomlish::to_dict [list $found_sub]] - } - ARRAY { - #we need to recurse to get the corresponding dict for the contained item(s) - #pass in the whole $found_sub - not just the $value! - set prev_tablenames_seen $tablenames_seen - set prev_tablenames_closed $tablenames_closed - set tablenames_seen [list] - set tablenames_closed [list] - set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_seen $prev_tablenames_seen - set tablenames_closed $prev_tablenames_closed - } - MULTISTRING - MULTILITERAL { - #review - mapping these to STRING might make some conversions harder? 
- #if we keep the MULTI - we know we have to look for newlines for example when converting to json - #without specific types we'd have to check every STRING - and lose info about how best to map chars within it - set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - } - default { - error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" - } - } - return $result - } - - proc _get_dottedkey_info {dottedkeyrecord} { - set key_hierarchy [list] - set key_hierarchy_raw [list] - if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { - error "tomlish _get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" - } - set compoundkeylist [lindex $dottedkeyrecord 1] - set expect_sep 0 - foreach part $compoundkeylist { - set parttag [lindex $part 0] - if {$parttag eq "WS"} { - continue - } - if {$expect_sep} { - if {$parttag ne "DOTSEP"} { - error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" - } - set expect_sep 0 - } else { - set val [lindex $part 1] - switch -exact -- $parttag { - KEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw $val - } - DQKEY { - lappend key_hierarchy [::tomlish::utils::unescape_string $val] - lappend key_hierarchy_raw \"$val\" - } - SQKEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw "'$val'" - } - default { - error "DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$item'" - } - } - set expect_sep 1 - } - } - return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] - } - - - - #to_dict is a *basic* programmatic datastructure for accessing the data. - # produce a dictionary of keys and values from a tomlish tagged list. - # to_dict is primarily for reading toml data. - #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, - # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. - # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. - #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. - # - - #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types - #(ARRAYS can be mixed type) - #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form - #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? - - #Namespacing? - #ie note the difference: - #[Data] - #temp = { cpu = 79.5, case = 72.0} - # versus - #[Data] - #temps = [{cpu = 79.5, case = 72.0}] - proc to_dict {tomlish} { - - #keep track of which tablenames have already been directly defined, - # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' - #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. - #we don't error out just because a previous tablename segment has already appeared. 
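#For example (sketch only - commands as they might be run in a tclsh with this package loaded):
#   tomlish::to_dict [tomlish::from_toml "\[a\]\n\[a.b\]\nx=1\n"]
#       ;# accepted - [a] followed by [a.b] merely extends table a
#   tomlish::to_dict [tomlish::from_toml "\[a\]\n\[a\]\n"]
#       ;# expected to raise an error - table 'a' is directly defined more than once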
- ##variable tablenames_seen [list] - if {[uplevel 1 [list info exists tablenames_seen]]} { - upvar tablenames_seen tablenames_seen - } else { - set tablenames_seen [list] ;#list of lists - } - if {[uplevel 1 [list info exists tablenames_closed]]} { - upvar tablenames_closed tablenames_closed - } else { - set tablenames_closed [list] ;#list of lists - } - - log::info "---> to_dict processing '$tomlish'<<<" - set items $tomlish - - foreach lst $items { - if {[lindex $lst 0] ni $::tomlish::tags} { - error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" - } - } - - if {[lindex $tomlish 0] eq "TOMLISH"} { - #ignore TOMLISH tag at beginning - set items [lrange $tomlish 1 end] - } - - set datastructure [dict create] - foreach item $items { - set tag [lindex $item 0] - #puts "...> item:'$item' tag:'$tag'" - switch -exact -- $tag { - KEY - DQKEY - SQKEY { - log::debug "---> to_dict item: processing $tag: $item" - set key [lindex $item 1] - if {$tag eq "DQKEY"} { - set key [::tomlish::utils::unescape_string $key] - } - #!todo - normalize key. (may be quoted/doublequoted) - - if {[dict exists $datastructure $key]} { - error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." - } - - #lassign [_get_keyval_value $item] type val - set keyval_dict [_get_keyval_value $item] - dict set datastructure $key $keyval_dict - } - DOTTEDKEY { - log::debug "---> to_dict item processing $tag: $item" - set dkey_info [_get_dottedkey_info $item] - set dotted_key_hierarchy [dict get $dkey_info keys] - - #a.b.c = 1 - #table_key_hierarchy -> a b - #leafkey -> c - if {[llength $dotted_key_hierarchy] == 0} { - #empty?? probably invalid. review - #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively - error "DOTTED key has no parts - invalid? '$item'" - } elseif {[llength $dotted_key_hierarchy] == 1} { - #dottedkey is only a key - no table component - set table_hierarchy [list] - set leafkey [lindex $dotted_key_hierarchy 0] - } else { - set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set leafkey [lindex $dotted_key_hierarchy end] - } - - #ensure empty tables are still represented in the datastructure - #review - this seems unnecessary? - set pathkeys [list] - foreach k $table_hierarchy { - lappend pathkeys $k - if {![dict exists $datastructure {*}$pathkeys]} { - dict set datastructure {*}$pathkeys [list] - } else { - tomlish::log::notice "to_dict datastructure at key path $pathkeys already had data: [dict get $datastructure {*}$pathkeys]" - } - } - #review? - if {[dict exists $datastructure {*}$table_hierarchy $leafkey]} { - error "Duplicate key '$table_hierarchy $leafkey'. The key already exists at this level in the toml data. The toml data is not valid." - } - - #JMN test 2025 - if {[llength $table_hierarchy]} { - lappend tablenames_seen $table_hierarchy - } - - set keyval_dict [_get_keyval_value $item] - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - lappend tablenames_seen [list {*}$table_hierarchy $leafkey] - lappend tablenames_closed [list {*}$table_hierarchy $leafkey] - - #review - item is an ITABLE - we recurse here without datastructure context :/ - #overwriting keys? todo ? 
- dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict - } else { - dict set datastructure {*}$table_hierarchy $leafkey $keyval_dict - } - - } - TABLE { - set tablename [lindex $item 1] - #set tablename [::tomlish::utils::tablename_trim $tablename] - set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - if {$norm_segments in $tablenames_seen} { - error "Table name '$tablename' has already been directly defined in the toml data. Invalid." - } - - log::debug "---> to_dict processing item $tag (name: $tablename): $item" - set name_segments [::tomlish::utils::tablename_split $tablename] ;#unnormalized - set last_seg "" - #toml spec rule - all segments mst be non-empty - #note that the results of tablename_split are 'raw' - ie some segments may be enclosed in single or double quotes. - - set table_key_sublist [list] - - foreach normseg $norm_segments { - lappend table_key_sublist $normseg - if {[dict exists $datastructure {*}$table_key_sublist]} { - #It's ok for this key to already exist *if* it was defined by a previous tablename or equivalent - #and if this key is longer - - #consider the following 2 which are legal: - #[table] - #x.y = 3 - #[table.x.z] - #k= 22 - - #equivalent - - #[table] - #[table.x] - #y = 3 - #[table.x.z] - #k=22 - - #illegal - #[table] - #x.y = 3 - #[table.x.y.z] - #k = 22 - ## - we should fail on encountering table.x.y because only table and table.x are effectively tables - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - set sublist_length [llength $table_key_sublist] - set found_testkey 0 - if {$table_key_sublist in $tablenames_seen} { - set found_testkey 1 - } else { - #see if it was defined by a longer entry - foreach seen_table_segments $tablenames_seen { - if {[llength $seen_table_segments] <= $sublist_length} { - continue - } - #each tablenames_seen entry is already a list of normalized segments - - #we could have [a.b.c.d] early on - # followed by [a.b] - which was still defined by the earlier one. - - set seen_longer [lrange $seen_segments 0 [expr {$sublist_length -1}]] - puts stderr "testkey:'$table_key_sublist' vs seen_match:'$seen_longer'" - if {$table_key_sublist eq $seen_longer} { - set found_testkey 1 - } - } - } - - if {$found_testkey == 0} { - #the raw table_key_hierarchy is better to display in the error message, although it's not the actual dict keyset - set msg "key $table_key_sublist already exists in datastructure, but wasn't defined by a supertable." 
- append msg \n "tablenames_seen:" \n - foreach ts $tablenames_seen { - append msg " " $ts \n - } - error $msg - } - } - - } - - #ensure empty tables are still represented in the datastructure - set table_keys [list] - foreach k $table_key_hierarchy { - lappend table_keys $k - if {![dict exists $datastructure {*}$table_keys]} { - dict set datastructure {*}$table_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (TABLE) subkey $table_keys already had data: [dict get $datastructure {*}$table_keys]" - } - } - - #We must do this after the key-collision test above! - lappend tablenames_seen $norm_segments - - - log::debug ">>> to_dict >>>>>>>>>>>>>>>>> table_key_hierarchy : $table_key_hierarchy" - - #now add the contained elements - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [_get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - set dotted_key_hierarchy [dict get $dkey_info keys] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - set leaf_key [lindex $dotted_key_hierarchy end] - - #ensure empty keys are still represented in the datastructure - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> $keyval_dict" - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - #JMN 2025 - lappend tablenames_seen [list {*}$table_key_hierarchy {*}$dkeys] - - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - lappend tablenames_seen [list {*}$table_key_hierarchy {*}$dkeys $leaf_key] - #if the keyval_dict is not a simple type x value y - then it's an inline table ? - #if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - } - - } - KEY - DQKEY - SQKEY { - #obsolete ? - set keyval_key [lindex $element 1] - if {$type eq "DQKEY"} { - set keyval_key [::tomlish::utils::unescape_string $keyval_key] - } - if {[dict exists $datastructure {*}$dotted_key_hierarchy $keyval_key]} { - error "Duplicate key '$dotted_key_hierarchy $key'. The key already exists at this level in the toml data. The toml data is not valid." 
- } - set keyval_dict [_get_keyval_value $element] - dict set datastructure {*}$dotted_key_hierarchy $keyval_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in table context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - #now make sure we add an empty value if there were no contained elements! - #!todo. - } - ITABLE { - #SEP??? - set datastructure [list] - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [_get_dottedkey_info $element] - set dotted_key_hierarchy [dict get $dkey_info keys] - set leaf_key [lindex $dotted_key_hierarchy end] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - - #ensure empty keys are still represented in the datastructure - set table_keys [list] ;#We don't know the context - next level up will have to check for key collisions? - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - } - ARRAY { - #arrays in toml are allowed to contain mixtures of types - set datastructure [list] - log::debug "--> processing array: $item" - - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - STRING { - set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - ITABLE { - #anonymous table - #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - lappend datastructure [::tomlish::to_dict [list $element]] ;#store itables within arrays as raw dicts (possibly empty) - } - TABLE { - #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review - #doesn't make sense as table needs a name? - #take as synonym for ITABLE? 
- error "to_dict TABLE within array unexpected" - } - ARRAY - MULTISTRING - MULTILITERAL { - #set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - } - WS - SEP - NEWLINE - COMMENT { - #ignore whitespace, commas, newlines and comments - } - default { - error "Unexpected value type '$type' found in array" - } - } - } - } - MULTILITERAL { - #triple squoted string - #first newline stripped only if it is the very first element - #(ie *immediately* following the opening delims) - #All whitespace other than newlines is within LITERALPARTS - # ------------------------------------------------------------------------- - #todo - consider extension to toml to allow indent-aware multiline literals - # how - propose as issue in toml github? Use different delim? e.g ^^^ ? - #e.g - # xxx=?'''abc - # def - # etc - # ''' - # - we would like to trimleft each line to the column following the opening delim - # ------------------------------------------------------------------------- - - log::debug "---> todict processing multiliteral: $item" - set parts [lrange $item 1 end] - if {[lindex $parts 0 0] eq "NEWLINE"} { - set parts [lrange $parts 1 end] ;#skip it - } - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - switch -exact -- $type { - LITERALPART { - append stringvalue [lindex $element 1] - } - NEWLINE { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - MULTISTRING { - #triple dquoted string - log::debug "---> to_dict processing multistring: $item" - set stringvalue "" - set idx 0 - set parts [lrange $item 1 end] - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted - switch -exact -- $type { - STRING { - #todo - do away with STRING ? - #we don't build MULTISTRINGS containing STRING - but should we accept it? - tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" - append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" - } - STRINGPART { - append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] - } - CONT { - #When the last non-whitespace character on a line is an unescaped backslash, - #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter - # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? 
- set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - append stringvalue "\\" ;#add the sep - } else { - #skip over ws without emitting - set idx [llength $parts] - } - } else { - set parts_til_nl [lrange $parts 0 $next_nl-1] - set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] - if {$non_ws >= 0} { - append stringvalue "\\" - } else { - #skip over ws on this line - set idx $next_nl - #then have to check each subsequent line until we get to first non-whitespace - set trimming 1 - while {$trimming && $idx < [llength $parts]} { - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - } else { - set idx [llength $parts] - } - set trimming 0 - } else { - set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - set trimming 0 - } else { - set idx $next_nl - #keep trimming - } - } - } - } - } - } - NEWLINE { - #if newline is first element - it is not part of the data of a multistring - if {$idx > 0} { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - } - WS { - append stringvalue [lindex $element 1] - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - WS - COMMENT - NEWLINE { - #ignore - } - default { - error "Unexpected tag '$tag' in Tomlish list '$tomlish'" - } - } - } - return $datastructure - } - - - proc _from_dictval_tomltype {parents tablestack keys typeval} { - set type [dict get $typeval type] - set val [dict get $typeval value] - switch -- $type { - ARRAY { - set subitems [list] - foreach item $val { - lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP - } - if {[lindex $subitems end] eq "SEP"} { - set subitems [lrange $subitems 0 end-1] - } - return [list ARRAY {*}$subitems] - } - ITABLE { - if {$val eq ""} { - return ITABLE - } else { - return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] - } - } - MULTISTRING { - #value is a raw string that isn't encoded as tomlish - #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format - #We need to convert controls in $val to escape sequences - except for newlines - # - #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) - #we could use a line-length limit to decide when to put in a "line ending backslash" - #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) 
REVIEW - # - #TODO - set tomlpart "x=\"\"\"\\\n" - append tomlpart $val "\"\"\"" - set tomlish [tomlish::decode::toml $tomlpart] - #e.g if val = " etc\nblah" - #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } - #lindex 1 3 is the MULTISTRING tomlish list - return [lindex $tomlish 1 3] - } - MULTILITERAL { - #MLL string can contain newlines - but still no control chars - #todo - validate - set tomlpart "x='''\n" - append tomlpart $val ''' - set tomlish [tomlish::decode::toml $tomlpart] - return [lindex $tomlish 1 3] - } - LITERAL { - #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" - #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format - # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be - # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) - #we could choose to change the type to another format here when encountering invalid chars - but that seems - #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. - if {[string first ' $val] >=0} { - error "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" - } - #detect control chars other than tab - #for this we can use rawstring_to_Bstring_with_escaped_controls - even though this isn't a Bstring - #we are just using the map to detect a difference. - set testval [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val] - if {$testval ne $val} { - #some escaping would have to be done if this value was destined for a Bstring... - #therefor this string has controls and isn't suitable for a LITERAL according to the specs. - error "_from_dictval_tomltype error: control chars (other than tab) found in LITERAL value - cannot encode dict to TOML-VALID TOMLISH" - } - return [list LITERAL $val] - } - STRING { - return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] - } - INT { - if {![::tomlish::utils::is_int $val]} { - error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list INT $val] - } - FLOAT { - if {![::tomlish::utils::is_float $val]} { - error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list FLOAT $val] - } - default { - if {$type ni [::tomlish::tags]} { - error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" - } - return [list $type $val] - } - } - } - - proc _from_dictval {parents tablestack keys vinfo} { - set k [lindex $keys end] - if {[regexp {\s} $k] || [string first . $k] >= 0} {} - if {![::tomlish::utils::is_barekey $k]} { - #Any dot in the key would have been split by to_dict - so if it's present here it's part of this key - not a level separator! - #requires quoting - #we'll use a basic mechanism for now to determine the type of quoting - whether it has any single quotes or not. - #todo - more? - #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. 
(no multiline forms) - if {[string first ' $k] >=0} { - #basic string - } else { - #literal string - set K_PART [list SQKEY $k] - } - } else { - set K_PART [list KEY $k] - } - puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" - puts stderr "---tablestack: $tablestack---" - set result [list] - set lastparent [lindex $parents end] - if {$lastparent in [list "" do_inline]} { - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - set type [dict get $vinfo type] - #treat ITABLE differently? - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} - } else { - #set result [list TABLE $k {NEWLINE lf}] - if {$vinfo ne ""} { - - #set result [list DOTTEDKEY [list [list KEY $k]] = ] - #set records [list ITABLE] - - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - - if {$lastparent eq "do_inline"} { - set result [list DOTTEDKEY [list $K_PART] =] - set records [list ITABLE] - } else { - #review - quoted k ?? - set result [list TABLE $k {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $k]] - set records [list] - } - - - - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - dict for {vk vv} $vinfo { - if {[regexp {\s} $vk] || [string first . $vk] >= 0} { - set VK_PART [list SQKEY $vk] - } else { - set VK_PART [list KEY $vk] - } - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] - } else { - if {$vv eq ""} { - #experimental - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" - #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] - set tname [join [list {*}$keys $vk] .] - set record [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - } else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - set tablestack [list {*}$tablestack [list I $vk]] - } - } else { - if { 0 } { - #experiment.. sort of getting there. - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" - set tname [join [list {*}$keys $vk] .] - set record [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - - #review - todo? 
- set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] - lappend record {*}$dottedkey_value - - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - } - if {$dictidx != $lastidx} { - #lappend record SEP - if {$lastparent eq "do_inline"} { - lappend record SEP - } else { - lappend record {NEWLINE lf} - } - } - lappend records $record - incr dictidx - } - if {$lastparent eq "do_inline"} { - lappend result $records {NEWLINE lf} - } else { - lappend result {*}$records {NEWLINE lf} - } - } else { - if {$lastparent eq "do_inline"} { - lappend result DOTTEDKEY [list [list KEY $k]] = ITABLE {NEWLINE lf} - } else { - lappend result TABLE $k {NEWLINE lf} - } - } - } - } else { - #lastparent is not toplevel "" or "do_inline" - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result {*}$sublist - } else { - if {$lastparent eq "TABLE"} { - #review - dict for {vk vv} $vinfo { - set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] - lappend result [list DOTTEDKEY [list [list KEY $vk]] = $dottedkey_value {NEWLINE lf}] - } - } else { - if {$vinfo ne ""} { - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - set sub [list] - #REVIEW - #set result $lastparent ;#e.g sets ITABLE - set result ITABLE - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - dict for {vk vv} $vinfo { - if {[regexp {\s} $vk] || [string first . $vk] >=0} { - set VK_PART [list SQKEY $vk] - } else { - set VK_PART [list KEY $vk] - } - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART] = $sublist] - } else { - if {$vv eq ""} { - #can't just uninline at this level - #we need a better method to query main dict for uninlinability at each level - # (including what's been inlined already) - #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - # puts stderr "_from_dictval uninline2 KEY $keys" - # set tname [join [list {*}$keys $vk] .] - # set record [list TABLE $tname {NEWLINE lf}] - # set tablestack [list {*}$tablestack [list T $vk]] - #} else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - #} - } else { - #set sub [_from_dictval ITABLE $vk $vv] - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - if {$dictidx != $lastidx} { - lappend record SEP - } - lappend result $record - incr dictidx - } - } else { - puts stderr "table x-1" - lappend result DOTTEDKEY [list $K_PART] = ITABLE - } - } - } - } - return $result - } - - - proc from_dict {d} { - #consider: - # t1={a=1,b=2} - # x = 1 - #If we represent t1 as an expanded table we get - # [t1] - # a=1 - # b=2 - # x=1 - # --- which is incorrect - as x was a toplevel key like t1! 
- #This issue doesn't occur if x is itself an inline table - # t1={a=1,b=2} - # x= {no="problem"} - # - # (or if we were to reorder x to come before t1) - - #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} - #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, - #which is unpreferred here. - - #A possible solution: - #scan the top level to see if all (trailing) elements are themselves dicts - # (ie not of form {type XXX value yyy}) - # - # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements - #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys - - #set root_has_values 0 - #approach 1) - the naive approach - forces inline when not always necessary - #dict for {k v} $d { - # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { - # set root_has_values 1 - # break - # } - #} - - - #approach 2) - track the position of last {type x value y} in the dictionary built by to_dict - # - still not perfect. Inlines dotted tables unnecessarily - #This means from_dict doesn't produce output optimal for human editing. - set last_simple [tomlish::dict::last_tomltype_posn $d] - - - ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values - #Any keys that are themselves tables - will need to be represented inline - #to avoid reordering, or incorrect assignment of plain values to the wrong table. - - ## set parent "" - #all toplevel keys in the dict structure can represent subtables. - #we are free to use {[tablename]\n} syntax for toplevel elements. - - - set tomlish [list TOMLISH] - set dictposn 0 - set tablestack [list [list T root]] ;#todo - dict for {t tinfo} $d { - if {$last_simple > $dictposn} { - set parents [list do_inline] - } else { - set parents [list ""] - } - set keys [list $t] - set trecord [_from_dictval $parents $tablestack $keys $tinfo] - lappend tomlish $trecord - incr dictposn - } - return $tomlish - } - - proc json_to_toml {json} { - #*** !doctools - #[call [fun json_to_toml] [arg json]] - #[para] - - set tomlish [::tomlish::from_json $json] - set toml [::tomlish::to_toml $tomlish] - } - - #TODO use huddle? 
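#Sketch of how the conversion layers above fit together (not part of the module - assumes a tclsh
#with this package loaded, plus the fish::json dependency where noted):
#   set d [tomlish::to_dict [tomlish::from_toml "x=1\ny=\"hi\"\n"]]
#       ;# d is a dict whose leaf values are {type <TAG> value <val>} pairs,
#       ;#  e.g x -> {type INT value 1}, y -> {type STRING value hi}
#   tomlish::to_toml [tomlish::from_dict $d]
#       ;# expected to re-emit toml from the dict form (whitespace/comments are not preserved)
#   tomlish::toml_to_json "x=1\n"
#       ;# requires the fish::json package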
- proc from_json {json} { - #set jstruct [::tomlish::json_struct $json] - #return [::tomlish::from_json_struct $jstruct] - package require huddle - package require huddle::json - set h [huddle::json::json2huddle parse $json] - - } - - proc from_json_struct {jstruct} { - package require fish::json_toml - return [fish::json_toml::jsonstruct2tomlish $jstruct] - } - - proc toml_to_json {toml} { - set tomlish [::tomlish::from_toml $toml] - return [::tomlish::get_json $tomlish] - } - - proc get_json {tomlish} { - package require fish::json - set d [::tomlish::to_dict $tomlish] - #return [::tomlish::dict_to_json $d] - return [fish::json::from "struct" $d] - } - - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -namespace eval tomlish::build { - #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness - # take a value of the appropriate type and wrap as a tomlish tagged item - proc STRING {s} { - return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] - } - proc LITERAL {litstring} { - - } - - proc INT {i} { - #whole numbers, may be prefixed with a + or - - #Leading zeros are not allowed - #Hex,octal binary forms are allowed (toml 1.0) - #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) - #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. - # - We should probably raise an error for number larger than this and suggest the user supply it as a string? - if {[tcl::string::last , $i] > -1} { - error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" - } - if {![::tomlish::utils::int_validchars $i]} { - error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" - } - - if {[::tomlish::utils::is_int $i]} { - return [list INT $i] - } else { - error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" - } - - } - - proc FLOAT {f} { - #convert any non-lower case variants of special values to lowercase for Toml - if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { - return [list FLOAT [tcl::string::tolower $f]] - } - if {[::tomlish::utils::is_float $f]} { - return [list FLOAT $f] - } else { - error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" - } - } - - proc DATETIME {str} { - if {[::tomlish::utils::is_datetime $str]} { - return [list DATETIME $str] - } else { - error "Unable to interpret '$str' as Toml datetime. Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" - } - } - - proc BOOLEAN {b} { - #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false - if {![tcl::string::is boolean -strict $b]} { - error "Unable to convert '$b' to Toml boolean true|false. 
[::tomlish::parse::report_line]" - } else { - if {$b && 1} { - return [::list BOOL true] - } else { - return [::list BOOL false] - } - } - } - - #REVIEW - #Take tablename followed by - # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} - # (accept also key value {STRING }) - # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types - proc _table {name args} { - set pairs [list] - foreach t $args { - if {[llength $t] == 4} { - if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { - error "Only items tagged as KEY = currently accepted as name-value pairs for table command" - } - lassign $t _k keystr _eq valuepart - if {[llength $valuepart] != 2} { - error "supplied value must be typed. e.g {INT 1} or {STRING test}" - } - lappend pairs [list KEY $keystr = $valuepart] - } elseif {[llength $t] == 2} { - #!todo - type heuristics - lassign $t n v - lappend pairs [list KEY $n = [list STRING $v]] - } else { - error "'KEY = { toml but - # the first newline is not part of the data. - # we elect instead to maintain a basic LITERALPART that must not contain newlines.. - # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, - #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. - set literal "" - foreach part [lrange $item 1 end] { - append literal [::tomlish::encode::tomlish [list $part] $nextcontext] - } - append toml '''$literal''' - } - INT - - BOOL - - FLOAT - - DATETIME { - append toml [lindex $item 1] - } - INCOMPLETE { - error "cannot process tomlish term tagged as INCOMPLETE" - } - COMMENT { - append toml "#[lindex $item 1]" - } - BOM { - #Byte Order Mark may appear at beginning of a file. Needs to be preserved. - append toml "\uFEFF" - } - default { - error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." - } - } - - } - return $toml - } - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] -} -#fish toml from tomlish - -#(encode tomlish as toml) -interp alias {} tomlish::to_toml {} tomlish::encode::tomlish - -# - - -namespace eval tomlish::decode { - #*** !doctools - #[subsection {Namespace tomlish::decode}] - #[para] - #[list_begin definitions] - - #return a Tcl list of tomlish tokens - #i.e get a standard list of all the toml terms in string $s - #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. - #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) - # ---------------------------------------------------------------------------------------------- - # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! - # e.g we deliberately don't check certain things such as duplicate table declarations here. - # ---------------------------------------------------------------------------------------------- - #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. - # (e.g perhaps a toml editor to highlight violations for fixing) - # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. 
- # e.g dicts or an object oriented structure - #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage - #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc - #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. - # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) - #If we were to unescape a tab character for example - # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. - # For this reason, we also do absolutely no line-ending transformations based on platform. - # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' - - proc toml {args} { - #*** !doctools - #[call [fun toml] [arg arg...]] - #[para] return a Tcl list of tomlish tokens - - set s [join $args \n] - - namespace upvar ::tomlish::parse is_parsing is_parsing - set is_parsing 1 - - if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { - tomlish::parse::spacestack destroy - } - struct::stack ::tomlish::parse::spacestack - - namespace upvar ::tomlish::parse last_space_action last_space_action - namespace upvar ::tomlish::parse last_space_type last_space_type - - namespace upvar ::tomlish::parse tok tok - set tok "" - - namespace upvar ::tomlish::parse type type - namespace upvar ::tomlish::parse tokenType tokenType - ::tomlish::parse::set_tokenType "" - namespace upvar ::tomlish::parse tokenType_list tokenType_list - set tokenType [list] ;#Flat (un-nested) list of tokentypes found - - namespace upvar ::tomlish::parse lastChar lastChar - set lastChar "" - - - set result "" - namespace upvar ::tomlish::parse nest nest - set nest 0 - - namespace upvar ::tomlish::parse v v ;#array keyed on nest level - - - set v(0) {TOMLISH} - array set s0 [list] ;#whitespace data to go in {SPACE {}} element. - set parentlevel 0 - - namespace upvar ::tomlish::parse i i - set i 0 - - namespace upvar ::tomlish::parse state state - - namespace upvar ::tomlish::parse braceCount braceCount - set barceCount 0 - namespace upvar ::tomlish::parse bracketCount bracketCount - set bracketCount 0 - - set sep 0 - set r 1 - namespace upvar ::tomlish::parse token_waiting token_waiting - set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. - - - set state "table-space" - ::tomlish::parse::spacestack push {type space state table-space} - namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. (incremented for each literal linefeed - but not escaped ones in data) - set linenum 1 - - set ::tomlish::parse::state_list [list] - try { - while {$r} { - set r [::tomlish::parse::tok $s] - #puts stdout "got tok: '$tok' while parsing string '$s' " - set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' - - - #puts "got token: '$tok' tokenType='$tokenType'. 
while v($nest) = [set v($nest)]" - #puts "-->tok: $tok tokenType='$tokenType'" - set prevstate $state - set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] - #review goNextState could perform more than one space_action - set space_action [dict get $transition_info space_action] - set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below - - if {[tcl::string::match "err-*" $state]} { - ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" - lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] - return $v(0) - } - # --------------------------------------------------------- - #NOTE there may already be a token_waiting at this point - #set_token_waiting can raise an error here, - # in which case the space_action branch needs to be rewritten to handle the existing token_waiting - # --------------------------------------------------------- - - if {$space_action eq "pop"} { - #pop_trigger_tokens: newline tablename endarray endinlinetable - #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. - set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like squote_seq need to do their own append - switch -exact -- $tokenType { - squote_seq { - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the xxx-squote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - switch -- $tok { - ' { - tomlish::parse::set_token_waiting type startsquote value $tok complete 1 startindex [expr {$i -1}] - } - '' { - #review - we should perhaps return double_squote instead? 
- #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] - } - ''' { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] - } - '''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 4 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the last for next loop - } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } - } - default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" - } - } - } - ''''' { - switch -exact -- $prevstate { - leading-squote-space { - error "---- 5 squotes from leading-squote-space - shouldn't get here" - #we should have emitted the triple and left the following squotes for next loop - } - trailing-squote-space { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } - } - default { - error "--- unexpected popped due to squote_seq but came from state '$prevstate' should have been leading-squote-space or trailing-squote-space" - } - } - } - } - puts stderr "tomlish::decode::toml ---- HERE squote_seq pop <$tok>" - } - triple_squote { - #presumably popping multiliteral-space - ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTILITERAL { - lappend merged $part - } - LITERALPART { - if {$lasttype eq "LITERALPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. 
- lappend merged $part - } - default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($next)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - equal { - #pop caused by = - switch -exact -- $prevstate { - dottedkey-space { - tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - dottedkey-space-tail { - #experiment? - tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - } - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - tablename { - #note: a tablename only 'pops' if we are greater than zero - error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" - } - tablearrayname { - #!review - tablearrayname different to tablename regarding push/pop? - #note: a tablename only 'pops' if we are greater than zero - error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" - } - endarray { - #nothing to do here. - } - comma { - #comma for inline table will pop the keyvalue space - lappend v($nest) "SEP" - } - endinlinetable { - ::tomlish::log::debug "---- endinlinetable for last_space_action pop" - } - endmultiquote { - ::tomlish::log::debug "---- endmultiquote for last_space_action 'pop'" - } - default { - error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" - } - } - if {$do_append_to_parent} { - #e.g squote_seq does it's own appends as necessary - so won't get here - lappend v($parentlevel) [set v($nest)] - } - - incr nest -1 - - } elseif {$last_space_action eq "push"} { - set prevnest $nest - incr nest 1 - set v($nest) [list] - # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname - - - switch -exact -- $tokenType { - squote_seq_begin { - #### - if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { - lassign [dict get $transition_info starttok] starttok_type starttok_val - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType $starttok_type - set tok $starttok_val - } - } - squotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - dquotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - XXXdquotedkey - XXXitablequotedkey { - #todo - set v($nest) [list DQKEY $tok] ;#$tok is the keyname - } - barekey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? 
- set waiting [tomlish::parse::get_token_waiting] - if {[llength $waiting]} { - set i [dict get $waiting startindex] - tomlish::parse::clear_token_waiting - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } else { - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - } - startsquote { - #JMN - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - tablename { - #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! - #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish - # back to toml file will be identical. - #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. - # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names. - - #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, - # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the - # tomlish list? - - set test_only [::tomlish::utils::tablename_trim $tok] - ::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$test_only'" - set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name - #note also that equivalent tablenames may have different toml representations even after being trimmed! - #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) - #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. - } - tablearrayname { - set test_only [::tomlish::utils::tablename_trim $tok] - puts stdout "trimmed (but not normalized) tablearrayname: '$test_only'" - set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name - } - startarray { - set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. - } - startinlinetable { - set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. 
- } - startmultiquote { - ::tomlish::log::debug "---- push trigger tokenType startmultiquote" - set v($nest) [list MULTISTRING] ;#container for STRINGPART, WS, CONT, NEWLINE - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERAL - } - default { - error "---- push trigger tokenType '$tokenType' not yet implemented" - } - } - - } else { - #no space level change - switch -exact -- $tokenType { - squotedkey { - puts "---- squotedkey in state $prevstate (no space level change)" - lappend v($nest) [list SQKEY $tok] - } - dquotedkey { - puts "---- dquotedkey in state $prevstate (no space level change)" - lappend v($nest) [list DQKEY $tok] - } - barekey { - lappend v($nest) [list KEY $tok] - } - dotsep { - lappend v($nest) [list DOTSEP] - } - starttablename { - #$tok is triggered by the opening bracket and sends nothing to output - } - starttablearrayname { - #$tok is triggered by the double opening brackets and sends nothing to output - } - tablename - tablenamearray { - error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" - #set v($nest) [list TABLE $tok] - } - endtablename - endtablearrayname { - #no output into the tomlish list for this token - } - startinlinetable { - puts stderr "---- decode::toml error. did not expect startinlinetable without space level change (no space level change)" - } - startquote { - switch -exact -- $newstate { - string-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "string" - set tok "" - } - quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "dquotedkey" - set tok "" - } - XXXitable-quoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablequotedkey" - set tok "" - } - default { - error "---- startquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - startsquote { - switch -exact -- $newstate { - literal-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "literal" - set tok "" - } - squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - XXXitable-squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "itablesquotedkey" - set tok "" - } - multiliteral-space { - #false alarm squote returned from squote_seq pop - ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" - #(single squote - not terminating space) - lappend v($nest) [list LITERALPART '] - #may need to be joined on pop if there are neighbouring LITERALPARTs - } - default { - error "---- startsquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - startmultiquote { - #review - puts stderr "---- got startmultiquote in state $prevstate (no space level change)" - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "stringpart" - set tok "" - } - endquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - endmultiquote { - #JMN!! 
- set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } - double_squote { - switch -exact -- $prevstate { - keyval-value-expected { - lappend v($nest) [list LITERAL ""] - } - multiliteral-space { - #multiliteral-space to multiliteral-space - lappend v($nest) [list LITERALPART ''] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - multistring { - #review - lappend v($nest) [list MULTISTRING $tok] - } - stringpart { - lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly - } - multiliteral { - lappend v($nest) [LIST MULTILITERAL $tok] - } - literalpart { - lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - itablequotedkey { - - } - untyped_value { - #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. - if {$tok in {true false}} { - set tag BOOL - } elseif {[::tomlish::utils::is_int $tok]} { - set tag INT - } elseif {[::tomlish::utils::is_float $tok]} { - set tag FLOAT - } elseif {[::tomlish::utils::is_datetime $tok]} { - set tag DATETIME - } else { - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" - } - lappend v($nest) [list $tag $tok] - - } - comment { - #puts stdout "----- comment token returned '$tok'------" - lappend v($nest) [list COMMENT "$tok"] - } - equal { - #we append '=' to the nest so that any surrounding whitespace is retained. - lappend v($nest) = - } - comma { - lappend v($nest) SEP - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - whitespace { - lappend v($nest) [list WS $tok] - } - continuation { - lappend v($nest) CONT - } - bom { - lappend v($nest) BOM - } - eof { - #ok - nothing more to add to the tomlish list. - #!todo - check previous tokens are complete/valid? 
- } - default { - error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - - if {!$next_tokenType_known} { - ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" - ::tomlish::parse::set_tokenType "" - set tok "" - } - - if {$state eq "end-state"} { - break - } - - - } - - #while {$nest > 0} { - # lappend v([expr {$nest -1}]) [set v($nest)] - # incr nest -1 - #} - while {[::tomlish::parse::spacestack size] > 1} { - ::tomlish::parse::spacestack pop - lappend v([expr {$nest -1}]) [set v($nest)] - incr nest -1 - - #set parent [spacestack peek] ;#the level being appended to - #lassign $parent type state - #if {$type eq "space"} { - # - #} elseif {$type eq "buffer"} { - # lappend v([expr {$nest -1}]) {*}[set v($nest)] - #} else { - # error "invalid spacestack item: $parent" - #} - } - - } finally { - set is_parsing 0 - } - return $v(0) - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] -} -#decode toml to tomlish -interp alias {} tomlish::from_toml {} tomlish::decode::toml - -namespace eval tomlish::utils { - #*** !doctools - #[subsection {Namespace tomlish::utils}] - #[para] - #[list_begin definitions] - - - #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace - # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] - #trimmed, the tablename becomes {a.b.c} - # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] - #ie whitespace is only irrelevant if it's outside a quoted segment - #trimmed, the tablename becomes {a.b."c etc "} - proc tablename_trim {tablename} { - set segments [tablename_split $tablename false] - set trimmed_segments [list] - foreach seg $segments { - lappend trimmed_segments [::string trim $seg " \t"] - } - return [join $trimmed_segments .] - } - - #basic generic quote matching for single and double quotes - #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes - proc tok_in_quotedpart {tok} { - set sLen [tcl::string::length $tok] - set quote_type "" - set had_slash 0 - for {set i 0} {$i < $sLen} {incr i} { - set c [tcl::string::index $tok $i] - if {$quote_type eq ""} { - if {$had_slash} { - #don't enter quote mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - set quote_type dq - } - sq { - set quote_type sq - } - bsl { - set had_slash 1 - } - } - } - } else { - if {$had_slash} { - #don't leave quoted mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - if {$quote_type eq "dq"} { - set quote_type "" - } - } - sq { - if {$quote_type eq "sq"} { - set quote_type "" - } - } - bsl { - set had_slash 1 - } - } - } - } - } - return $quote_type ;#dq | sq - } - - #utils::tablename_split - proc tablename_split {tablename {normalize false}} { - #we can't just split on . because we have to handle quoted segments which may contain a dot. 
- #eg {dog."tater.man"} - set sLen [tcl::string::length $tablename] - set segments [list] - set mode "unknown" ;#5 modes: unknown, quoted,litquoted, unquoted, syntax - #quoted is for double-quotes, litquoted is for single-quotes (string literal) - set seg "" - for {set i 0} {$i < $sLen} {incr i} { - - if {$i > 0} { - set lastChar [tcl::string::index $tablename [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $tablename $i] - - if {$c eq "."} { - switch -exact -- $mode { - unquoted { - #dot marks end of segment. - lappend segments $seg - set seg "" - set mode "unknown" - } - quoted { - append seg $c - } - unknown { - lappend segments $seg - set seg "" - } - litquoted { - append seg $c - } - default { - #mode: syntax - #we got our dot. - the syntax mode is now satisfied. - set mode "unknown" - } - } - } elseif {($c eq "\"") && ($lastChar ne "\\")} { - if {$mode eq "unknown"} { - if {[tcl::string::trim $seg] ne ""} { - #we don't allow a quote in the middle of a bare key - error "tablename_split. character '\"' invalid at this point in tablename. tablename: '$tablename'" - } - set mode "quoted" - set seg "\"" - } elseif {$mode eq "unquoted"} { - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg - } - - set seg "" - set mode "syntax" ;#make sure we only accept a dot or end-of-data now. - } elseif {$mode eq "litquoted"} { - append seg $c - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" - } - } elseif {($c eq "\'")} { - if {$mode eq "unknown"} { - append seg $c - set mode "litquoted" - } elseif {$mode eq "unquoted"} { - #single quote inside e.g o'neill - append seg $c - } elseif {$mode eq "quoted"} { - append seg $c - - } elseif {$mode eq "litquoted"} { - append seg $c - #no normalization to do - lappend segments $seg - set seg "" - set mode "syntax" - } elseif {$mode eq "syntax"} { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" - } - - } elseif {$c in [list " " \t]} { - if {$mode eq "syntax"} { - #ignore - } else { - append seg $c - } - } else { - if {$mode eq "syntax"} { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" - } - if {$mode eq "unknown"} { - set mode "unquoted" - } - append seg $c - } - if {$i == $sLen-1} { - #end of data - ::tomlish::log::debug "End of data: mode='$mode'" - #REVIEW - we can only end up in unquoted or syntax here? are other branches reachable? - switch -exact -- $mode { - quoted { - if {$c ne "\""} { - error "tablename_split. missing closing double-quote in a segment. tablename: '$tablename'" - } - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - #lappend segments [subst -nocommands -novariables [::string range $seg 1 end-1]] ;#wrong - } else { - lappend segments $seg - } - } - litquoted { - set trimmed_seg [tcl::string::trim $seg] - if {[tcl::string::index $trimmed_seg end] ne "\'"} { - error "tablename_split. missing closing single-quote in a segment. 
tablename: '$tablename'"
-                        }
-                        lappend segments $seg
-                    }
-                    unquoted - unknown {
-                        lappend segments $seg
-                    }
-                    syntax {
-                        #ok - segment already lappended
-                    }
-                    default {
-                        lappend segments $seg
-                    }
-                }
-            }
-        }
-        foreach seg $segments {
-            set trimmed [tcl::string::trim $seg " \t"]
-            #note - we explicitly allow 'empty' quoted strings '' & ""
-            # (these are 'discouraged' but valid toml keys)
-            #if {$trimmed in [list "''" "\"\""]} {
-            #    puts stderr "tablename_split. warning - Empty quoted string as tablename segment"
-            #}
-            if {$trimmed eq ""} {
-                error "tablename_split. Empty segment found. tablename: '$tablename' segments [llength $segments] ($segments)"
-            }
-        }
-        return $segments
-    }
-
-    proc unicode_escape_info {slashu} {
-        #!todo
-        # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and
-        # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points)
-        # ie integers in the range 0x0 to 0xD7FF and 0xE000 to 0x10FFFF inclusive
-        #expr {(($x >= 0) && ($x <= 0xD7FF)) || (($x >= 0xE000) && ($x <= 0x10FFFF))}
-        if {[tcl::string::match {\\u*} $slashu]} {
-            set exp {^\\u([0-9a-fA-F]{4}$)}
-            if {[regexp $exp $slashu match hex]} {
-                if {[scan $hex %4x dec] != 1} {
-                    #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch?
-                    return [list err [list reason "Failed to convert '$hex' to decimal"]]
-                } else {
-                    return [list ok [list char [subst -nocommand -novariable $slashu]]]
-                }
-            } else {
-                return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]]
-            }
-        } elseif {[tcl::string::match {\\U*} $slashu]} {
-            set exp {^\\U([0-9a-fA-F]{8}$)}
-            if {[regexp $exp $slashu match hex]} {
-                if {[scan $hex %8x dec] != 1} {
-                    #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch?
-                    return [list err [list reason "Failed to convert '$hex' to decimal"]]
-                } else {
-                    if {(($dec >= 0) && ($dec <= 0xD7FF)) || (($dec >= 0xE000) && ($dec <= 0x10FFFF))} {
-                        return [list ok [list char [subst -nocommand -novariable $slashu]]]
-                    } else {
-                        return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0x0 to 0xD7FF or 0xE000 to 0x10FFFF"]]
-                    }
-                }
-            } else {
-                return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]]
-            }
-        } else {
-            return [list err [list reason "Supplied string did not start with \\u or \\U"]]
-        }
-
-    }
-
-    #Note that unicode characters don't *have* to be escaped.
-    #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values.
-    #- an inverse of unescape_string would encode all unicode chars unnecessarily.
-    #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u0009 etc
-    #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway.
-    #REVIEW - provide it anyway? When would it be desirable to use?
-
-    variable Bstring_control_map [list\
-        \b {\b}\
-        \n {\n}\
-        \r {\r}\
-        \" {\"}\
-        \x1b {\e}\
-        \\ "\\\\"\
-    ]
-    #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/
-    #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped.
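# --- Editor's note: illustrative sketch only, not part of the diff above. ---
# A minimal usage sketch of tablename_trim / tablename_split as defined earlier in this hunk,
# assuming the module is loadable as 'package require tomlish' (an assumption). The expected
# values follow the behaviour described in the comments: whitespace around dot-delimited
# segments is irrelevant unless it sits inside a quoted segment, and only the 'normalize'
# form strips the quotes and applies escapes.
package require tomlish
puts [tomlish::utils::tablename_trim {a.b . c}]                 ;# expected: a.b.c
puts [tomlish::utils::tablename_trim {a . b . "c etc " }]       ;# expected: a.b."c etc "
puts [tomlish::utils::tablename_split {dog."tater.man"}]        ;# expected (raw):        dog {"tater.man"}
puts [tomlish::utils::tablename_split {dog."tater.man"} true]   ;# expected (normalized): dog tater.man
# --- end editor's note ---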
- for {set cdec 0} {$cdec <= 8} {incr cdec} { - set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh - } - for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { - set hhhh [format %.4X $cdec] - lappend Bstring_control_map [format %c $cdec] \\u$hhhh - } - # \u007F = 127 - lappend Bstring_control_map [format %c 127] \\u007F - - #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! - #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) - #for example - can be used by from_dict to produce valid Bstring data for a tomlish record - proc rawstring_to_Bstring_with_escaped_controls {str} { - #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. - #we'll use a string map with an explicit list rather than algorithmic at runtime - # - the string map is probably more performant than splitting a string, especially if it's large - variable Bstring_control_map - return [string map $Bstring_control_map $str] - } - - #review - unescape what string? Bstring vs MLBstring? - #we should be specific in the function naming here - #used by to_dict - so part of validation? - REVIEW - proc unescape_string {str} { - #note we can't just use Tcl subst because: - # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. - # it would strip out backslashes inappropriately: e.g "\j" becomes just j - # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn - # it replaces \ with a single whitespace (trailing backslash) - #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh - - set buffer "" - set buffer4 "" ;#buffer for 4 hex characters following a \u - set buffer8 "" ;#buffer for 8 hex characters following a \u - - set sLen [tcl::string::length $str] - - #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc - set slash_active 0 - set unicode4_active 0 - set unicode8_active 0 - - ::tomlish::log::debug "unescape_string. got len [string length str] str $str" - - #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? - set i 0 - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $str [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $str $i] - #::tomlish::log::debug "unescape_string. got char $c" ;#too much? - - #---------------------- - #as we are 'unescaping' - should we really be testing for existing values that should have been escaped? - #this test looks incomplete anyway REVIEW - scan $c %c n - if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { - #we don't expect unescaped unicode characters from 0000 to 001F - - #*except* for raw tab (which is whitespace) and newlines - error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" - } - #---------------------- - - incr i ;#must incr here because we do'returns'inside the loop - if {$c eq "\\"} { - if {$slash_active} { - append buffer "\\" - set slash_active 0 - } elseif {$unicode4_active} { - error "unescape_string. unexpected case slash during unicode4 not yet handled" - } elseif {$unicode8_active} { - error "unescape_string. 
unexpected case slash during unicode8 not yet handled" - } else { - # don't output anything (yet) - set slash_active 1 - } - } else { - if {$unicode4_active} { - if {[tcl::string::length $buffer4] < 4} { - append buffer4 $c - } - if {[tcl::string::length $buffer4] == 4} { - #we have a \uHHHH to test - set unicode4_active 0 - set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$unicode8_active} { - if {[tcl::string::length $buffer8] < 8} { - append buffer8 $c - } - if {[tcl::string::length $buffer8] == 8} { - #we have a \UHHHHHHHH to test - set unicode8_active 0 - set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$slash_active} { - set slash_active 0 - set ctest [tcl::string::map {{"} dq} $c] - switch -exact -- $ctest { - dq { - set e "\\\"" - append buffer [subst -nocommand -novariable $e] - } - b - t - n - f - r { - set e "\\$c" - append buffer [subst -nocommand -novariable $e] - } - u { - set unicode4_active 1 - set buffer4 "" - } - U { - set unicode8_active 1 - set buffer8 "" - } - default { - set slash_active 0 - #review - toml spec says all other escapes are reserved - #and if they are used TOML should produce an error. - #we leave detecting this for caller for now - REVIEW - append buffer "\\" - append buffer $c - } - } - } else { - append buffer $c - } - } - } - #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" - if {$unicode4_active} { - error "End of string reached before complete unicode escape sequence \uHHHH" - } - if {$unicode8_active} { - error "End of string reached before complete unicode escape sequence \UHHHHHHHH" - } - if {$slash_active} { - append buffer "\\" - } - return $buffer - } - - #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) - #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, - #e.g squoted vs dquoted vs barekey. - proc normalize_key {rawkey} { - set c1 [tcl::string::index $rawkey 0] - set c2 [tcl::string::index $rawkey end] - if {($c1 eq "'") && ($c2 eq "'")} { - #single quoted segment. No escapes allowed within it. - set key [tcl::string::range $rawkey 1 end-1] - } elseif {($c1 eq "\"") && ($c2 eq "\"")} { - #double quoted segment. Apply escapes. - # - set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only - #e.g key could have mix of \UXXXXXXXX escapes and unicode chars - #or mix of \t and literal tabs. - #unescape to convert all to literal versions for comparison - set key [::tomlish::utils::unescape_string $keydata] - #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. - } else { - set key $rawkey - } - return $key - } - - proc string_to_slashu {string} { - set rv {} - foreach c [split $string {}] { - scan $c %c cdec - if {$cdec > 65535} { - append rv {\U} [format %.8X $cdec] - } else { - append rv {\u} [format %.4X $cdec] - } - } - return $rv - } - - #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. 
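# --- Editor's note: illustrative sketch only, not part of the diff above. ---
# A minimal sketch of key normalization for comparison purposes, assuming 'package require
# tomlish' loads this module (an assumption). unescape_string substitutes only the TOML
# escapes (\b \t \n \f \r \" \\ \uHHHH \UHHHHHHHH), and normalize_key strips the quoting
# mechanism so that equivalent keys written as bare, squoted or dquoted compare equal.
package require tomlish
puts [tomlish::utils::unescape_string {col1\tcol2}]   ;# expected: col1<tab>col2 (literal tab)
set k1 [tomlish::utils::normalize_key {"\u0041bc"}]   ;# dquoted key - \u0041 unescaped to A
set k2 [tomlish::utils::normalize_key {'Abc'}]        ;# squoted key - no escapes applied
puts [expr {$k1 eq $k2}]                              ;# expected: 1
# --- end editor's note ---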
- #This is used for display purposes only (error msgs) - proc nonprintable_to_slashu {s} { - set res "" - foreach i [split $s ""] { - scan $i %c cdec - - set printable 0 - if {($cdec>31) && ($cdec<127)} { - set printable 1 - } - if {$printable} { - append res $i - } else { - if {$cdec > 65535} { - append res \\U[format %.8X $cdec] - } else { - append res \\u[format %.4X $cdec] - } - } - } - set res - } ;# initial version from tcl wiki RS - - #check if str is valid for use as a toml bare key - #Early toml versions? only allowed letters + underscore + dash - proc is_barekey1 {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } else { - set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters match the regexp - return 1 - } else { - return 0 - } - } - } - - #from toml.abnf in github.com/toml-lang/toml - #unquoted-key = 1*unquoted-key-char - #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ - #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions - #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block - #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon - #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics - #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators - #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols - #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation - #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank - #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space - #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - variable re_barekey - set ranges [list] - lappend ranges {a-zA-Z0-9\_\-} - lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions - lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block - lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon - lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics - lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators - lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols - lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation - lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank - lappend ranges 
{\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space - lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - set re_barekey {^[} - foreach r $ranges { - append re_barekey $r - } - append re_barekey {]+$} - - proc is_barekey {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } - variable re_barekey - return [regexp $re_barekey $str] - } - - #test only that the characters in str are valid for the toml specified type 'integer'. - proc int_validchars1 {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - #add support for hex,octal,binary 0x.. 0o.. 0b... - proc int_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_int {str} { - set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - - # --------------------------------------- - #check for leading zeroes in non 0x 0b 0o - #first strip any +, - or _ (just for this test) - set check [tcl::string::map {+ "" - "" _ ""} $str] - if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { - return 0 - } - # --------------------------------------- - - #check +,- only occur in the first position. - if {[tcl::string::last - $str] > 0} { - return 0 - } - if {[tcl::string::last + $str] > 0} { - return 0 - } - set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores - #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) - if {![tcl::string::is integer -strict $numeric_value]} { - return 0 - } - #!todo - check bounds only based on some config value - #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements. - #presumably very large numbers would have to be supplied in a toml file as strings. - #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max - if {$numeric_value > $::tomlish::max_int} { - return 0 - } - if {$numeric_value < $::tomlish::min_int} { - return 0 - } - } else { - return 0 - } - #Got this far - didn't find anything wrong with it. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'float'. 
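# --- Editor's note: illustrative sketch only, not part of the diff above. ---
# Expected accept/reject behaviour of is_int as implemented above, assuming 'package require
# tomlish' loads this module (an assumption): underscore separators and 0x/0o/0b prefixes are
# accepted, while leading zeros, misplaced signs and float-looking values such as 3e4 are rejected.
package require tomlish
puts [tomlish::utils::is_int 1_000]       ;# expected: 1
puts [tomlish::utils::is_int 0xDEADBEEF]  ;# expected: 1
puts [tomlish::utils::is_int 042]         ;# expected: 0 (leading zero without 0x/0o/0b prefix)
puts [tomlish::utils::is_int 3e4]         ;# expected: 0 (float syntax, rejected by the integer check)
# --- end editor's note ---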
- proc float_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { - return 1 - } else { - #only allow lower case for these special values - as per Toml 1.0 spec - if {$str ni {inf +inf -inf nan +nan -nan}} { - return 0 - } else { - return 1 - } - } - } - - proc is_float {str} { - set matches [regexp -all {[eE0-9\_\-\+\.]} $str] - #don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) - if {$str in {inf +inf -inf nan +nan -nan}} { - return 1 - } - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) - #Toml spec also disallows leading zeros in the exponent part - #... but this seems less interoperable (some libraries generate leading zeroes in exponents) - #for now we will allow leading zeros in exponents - #!todo - configure 'strict' option to disallow? - #first strip any +, - or _ (just for this test) - set check [tcl::string::map {+ "" - "" _ ""} $str] - set r {([0-9])*} - regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E - set z {([0])*} - regexp $z $intpart leadingzeros - if {[tcl::string::length $leadingzeros] > 1} { - return 0 - } - #for floats, +,- may occur in multiple places - #e.g -2E-22 +3e34 - #!todo - check bounds ? - - #strip underscores for tcl double check - set check [tcl::string::map {_ ""} $str] - #string is double accepts inf nan +NaN etc. - if {![tcl::string::is double $check]} { - return 0 - } - - } else { - return 0 - } - #Got this far - didn't find anything wrong with it. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'datetime'. - proc datetime_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - #review - we - proc is_datetime {str} { - #e.g 1979-05-27 - #e.g 1979-05-27T00:32:00Z - #e.g 1979-05-27 00:32:00-07:00 - #e.g 1979-05-27 00:32:00+10:00 - #e.g 1979-05-27 00:32:00.999999-07:00 - - #review - #minimal datetimes? - # 2024 ok - shortest valid 4 digit year? - # 02:00 ok - # 05-17 ok - if {[string length $str] < 4} { - return 0 - } - - set matches [regexp -all {[zZtT0-9\-\+\.:]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - #!todo - use full RFC 3339 parser? - lassign [split $str T] datepart timepart - #!todo - what if the value is 'time only'? - - #Tcl's free-form clock scan (no -format option) is deprecated - # - #if {[catch {clock scan $datepart} err]} { - # puts stderr "tcl clock scan failed err:'$err'" - # return 0 - #} - - #!todo - verify time part is reasonable - } else { - return 0 - } - return 1 - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] -} - -namespace eval tomlish::parse { - #*** !doctools - #[subsection {Namespace tomlish::parse}] - #[para] - #[list_begin definitions] - - #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. - #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: - # - e.g some kind of backtracking required if using an ABNF parser? 
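# --- Editor's note: illustrative sketch only, not part of the diff above. ---
# Expected behaviour of the loose float/datetime validators defined above, assuming
# 'package require tomlish' loads this module (an assumption). The special float values are
# only accepted in lower case, as per the TOML 1.0 spec, and is_datetime only checks
# character classes and minimum length rather than doing a full RFC 3339 parse.
package require tomlish
puts [tomlish::utils::is_float 6.626e-34]                 ;# expected: 1
puts [tomlish::utils::is_float inf]                       ;# expected: 1
puts [tomlish::utils::is_float Inf]                       ;# expected: 0 (case variants not allowed)
puts [tomlish::utils::is_datetime 1979-05-27T00:32:00Z]   ;# expected: 1
puts [tomlish::utils::is_datetime 02:00]                  ;# expected: 1 (time-only form)
# --- end editor's note ---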
- #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" - #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' - - #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? - - #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) - - - variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text - - variable state - # states: - # table-space, itable-space, array-space - # value-expected, keyval-syntax, - # quoted-key, squoted-key - # string-state, literal-state, multistring... - # - # notes: - # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack - - # - # value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax - # - #stateMatrix defines for each state, actions to take for each possible token. - #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. - #dual-element actions are a push instruction and the name of the space to push on the stack. - # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) - - # -- --- --- --- --- --- - #token/state naming guide - # -- --- --- --- --- --- - #tokens : underscore separated or bare name e.g newline, start_quote, start_squote - #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence - #states : always contain at least one dash e.g err-state, table-space - #instructions - # -- --- --- --- --- --- - - - #stateMatrix dict of elements mapping current state to next state based on returned tokens - # current-state {token-encountered next-state ... } - # where next-state can be a 1 or 2 element list. 
- #If 2 element - the first item is an instruction (ucase) - #If 1 element - it is either a lowercase dashed state name or an ucase instruction - #e.g {PUSHSPACE } or POPSPACE or SAMESPACE - - - #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases - - variable stateMatrix - set stateMatrix [dict create] - - #xxx-space vs xxx-syntax inadequately documented - TODO - - # --------------------------------------------------------------------------------------------------------------# - # incomplete example of some state starting at table-space - # --------------------------------------------------------------------------------------------------------------# - # ( = -> value-expected) - # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) - # keyval-space (autotransition on push ^) - # table-space (barekey^) (startdquote -> dquoted-key ^) - # --------------------------------------------------------------------------------------------------------------# - - dict set stateMatrix\ - table-space { - bom "table-space"\ - whitespace "table-space"\ - newline "table-space"\ - barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ - dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXstartquote "quoted-key"\ - XXXstartsquote "squoted-key"\ - comment "table-space"\ - starttablename "tablename-state"\ - starttablearrayname "tablearrayname-state"\ - startmultiquote "err-state"\ - endquote "err-state"\ - comma "err-state"\ - eof "end-state"\ - equal "err-state"\ - cr "err-lonecr"\ - } - - #itable-space/ curly-syntax : itables - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - XXXstartquote "quoted-key"\ - XXXstartsquote {TOSTATE "squoted-key" comment "jn-testing"}\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-space starttok {squote_seq "'"}} - - - dict set stateMatrix\ - keyval-space {\ - whitespace "keyval-syntax"\ - equal "keyval-value-expected"\ - } - - # ' = ' portion of keyval - dict set stateMatrix\ - keyval-syntax {\ - whitespace "keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "keyval-value-expected"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #### - dict set stateMatrix\ - keyval-value-expected {\ - whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-value-expected starttok {squote_seq "'"}}\ - startquote {TOSTATE "string-state" returnstate keyval-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - startsquote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - 
startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate keyval-process-leading-squotes starttok {squote_seq "'"}} - dict set stateMatrix\ - leading-squote-space {\ - squote_seq "POPSPACE"\ - } - #dict set stateMatrix\ - # keyval-process-leading-squotes {\ - # startsquote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - # } - - dict set stateMatrix\ - keyval-tail {\ - whitespace "keyval-tail"\ - newline "POPSPACE"\ - comment "keyval-tail"\ - eof "end-state"\ - } - - dict set stateMatrix\ - itable-keyval-syntax {\ - whitespace "itable-keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "itable-keyval-value-expected"\ - newline "err-state"\ - eof "err-state"\ - } - dict set stateMatrix\ - itable-keyval-value-expected {\ - whitespace "itable-keyval-value-expected"\ - untyped_value {TOSTATE "itable-val-tail" note ""}\ - squote_seq_begin {PUSHSPACE "leading-squote-space" returnstate itable-keyval-value-expected starttok {squote_seq "'"}}\ - startquote {TOSTATE "string-state" returnstate itable-val-tail}\ - startmultiquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - startsquote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ - startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ - startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ - } - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } - - dict set stateMatrix\ - itable-val-tail {\ - whitespace "itable-val-tail"\ - endinlinetable "POPSPACE"\ - comma "POPSPACE"\ - XXXnewline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - newline "POPSPACE"\ - comment "itable-val-tail"\ - eof "err-state"\ - } - #dict set stateMatrix\ - # itable-quoted-key {\ - # whitespace "NA"\ - # itablequotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endquote "itable-keyval-syntax"\ - # } - #dict set stateMatrix\ - # itable-squoted-key {\ - # whitespace "NA"\ - # itablesquotedkey {PUSHSPACE "itable-keyval-space"}\ - # newline "err-state"\ - # endsquote "itable-keyval-syntax"\ - # } - - - - - - dict set stateMatrix\ - value-expected {\ - whitespace "value-expected"\ - untyped_value {"SAMESPACE" "" replay untyped_value}\ - startquote "string-state"\ - startsquote "literal-state"\ - startmultiquote {PUSHSPACE "multistring-space"}\ - triple_squote {PUSHSPACE "multiliteral-space"}\ - startinlinetable {PUSHSPACE itable-space}\ - startarray {PUSHSPACE array-space}\ - comment "err-state-value-expected-got-comment"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - - #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] - #it is for keyval ie x.y.z = value - - #this is the state after dot - #we are expecting a complete key token or whitespace - #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) - dict set stateMatrix\ - dottedkey-space {\ - whitespace "dottedkey-space"\ - dotsep "err-state"\ - barekey "dottedkey-space-tail"\ - squotedkey "dottedkey-space-tail"\ - 
dquotedkey "dottedkey-space-tail"\ - newline "err-state"\ - comma "err-state"\ - comment "err-state"\ - equal "err-state"\ - } - #dottedkeyend "POPSPACE" - #equal "POPSPACE"\ - - #jmn 2025 - #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop - dict set stateMatrix\ - dottedkey-space-tail {\ - whitespace "dottedkey-space-tail" - dotsep "dottedkey-space" - equal "POPSPACE"\ - } - - #-------------------------------------------------------------------------- - #scratch area - #from_toml {x=1} - # barekey tok - # table-space PUSHSPACE keyval-space state keyval-syntax - # - - - #-------------------------------------------------------------------------- - - - #REVIEW - #toml spec looks like heading towards allowing newlines within inline tables - #https://github.com/toml-lang/toml/issues/781 - - #2025 - appears to be valid for 1.1 - which we are targeting. - #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table - - #JMN2025 - #dict set stateMatrix\ - # curly-syntax {\ - # whitespace "curly-syntax"\ - # newline "curly-syntax"\ - # barekey {PUSHSPACE "itable-keyval-space"}\ - # itablequotedkey "itable-keyval-space"\ - # endinlinetable "POPSPACE"\ - # startquote "itable-quoted-key"\ - # comma "itable-space"\ - # comment "itable-space"\ - # eof "err-state"\ - # } - #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES - #We currently allow multiline ITABLES (also with comments) in the tokenizer. - #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? - - - #JMN REVIEW - dict set stateMatrix\ - array-space {\ - whitespace "array-space"\ - newline "array-space"\ - untyped_value "SAMESPACE"\ - startarray {PUSHSPACE "array-space"}\ - endarray "POPSPACE"\ - startmultiquote {PUSHSPACE multistring-space}\ - startinlinetable {PUSHSPACE itable-space}\ - startquote "string-state"\ - startsquote "literal-state"\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ - comma "array-space"\ - comment "array-space"\ - eof "err-state-array-space-got-eof"\ - } - dict set stateMatrix\ - array-syntax {\ - whitespace "array-syntax"\ - newline "array-syntax"\ - untyped_value "SAMESPACE"\ - startarray {PUSHSPACE array-space}\ - endarray "POPSPACE"\ - startmultiquote {PUSHSPACE multistring-space}\ - startquote "string-state"\ - startsquote "literal-state"\ - comma "array-space"\ - comment "err-state"\ - } - - - - #quoted-key & squoted-key need to PUSHSPACE from own token to keyval-space - dict set stateMatrix\ - quoted-key {\ - whitespace "NA"\ - dquotedkey {PUSHSPACE "keyval-space"}\ - newline "err-state"\ - endquote "keyval-syntax"\ - } - - - #review - dict set stateMatrix\ - dquoted-key {\ - whitespace "NA"\ - dquotedkey "dquoted-key"\ - newline "err-state"\ - } - dict set stateMatrix\ - squoted-key {\ - whitespace "NA"\ - squotedkey "squoted-key"\ - newline "err-state"\ - } - # endsquote {PUSHSPACE "keyval-space"} - - dict set stateMatrix\ - string-state {\ - whitespace "NA"\ - string "string-state"\ - endquote "SAMESPACE"\ - newline "err-state"\ - eof "err-state"\ - } - dict set stateMatrix\ - literal-state {\ - whitespace "NA"\ - literal "literal-state"\ - endsquote "SAMESPACE"\ - newline "err-state"\ - eof "err-state"\ - } - - - #dict set stateMatrix\ - # stringpart {\ - # continuation "SAMESPACE"\ - # endmultiquote "POPSPACE"\ - # eof "err-state"\ - # } - dict set stateMatrix\ - multistring-space {\ - 
whitespace "multistring-space"\ - continuation "multistring-space"\ - stringpart "multistring-space"\ - newline "multistring-space"\ - endmultiquote "POPSPACE"\ - eof "err-state"\ - } - - - #only valid subparts are literalpart and newline. other whitespace etc is within literalpart - #todo - treat sole cr as part of literalpart but crlf and lf as newline - dict set stateMatrix\ - multiliteral-space {\ - literalpart "multiliteral-space"\ - newline "multiliteral-space"\ - squote_seq_begin {PUSHSPACE "trailing-squote-space" returnstate multiliteral-space starttok {squote_seq "'"}}\ - triple_squote {POPSPACE note "on popping - we do any necessary concatenation of LITERALPART items due to squote processing"}\ - double_squote {TOSTATE multiliteral-space note "short squote_seq: can occur anywhere in the space e.g emitted at end when 5 squotes occur"}\ - startsquote {TOSTATE multiliteral-space note "short squote_seq: same as double_squote - false alarm"}\ - eof "err-premature-eof-in-multiliteral-space"\ - } - - #trailing because we are looking for possible terminating ''' - but must accept '''' or ''''' and re-integrate the 1st one or 2 extra squotes - dict set stateMatrix\ - trailing-squote-space {\ - squote_seq "POPSPACE"\ - } - - - dict set stateMatrix\ - tablename-state {\ - whitespace "NA"\ - tablename {zeropoppushspace table-space}\ - tablename2 {PUSHSPACE table-space}\ - endtablename "tablename-tail"\ - comma "err-state"\ - newline "err-state"\ - } - dict set stateMatrix\ - tablearrayname-state {\ - whitespace "NA"\ - tablearrayname {zeropoppushspace table-space}\ - tablearrayname2 {PUSHSPACE table-space}\ - endtablearray "tablearrayname-tail"\ - comma "err-state"\ - newline "err-state"\ - } - - dict set stateMatrix\ - tablename-tail {\ - whitespace "tablename-tail"\ - newline "table-space"\ - comment "tablename-tail"\ - eof "end-state"\ - } - dict set stateMatrix\ - tablearrayname-tail {\ - whitespace "tablearrayname-tail"\ - newline "table-space"\ - comment "tablearrayname-tail"\ - eof "end-state"\ - } - dict set stateMatrix\ - end-state {} - - set knowntokens [list] - set knownstates [list] - dict for {state transitions} $stateMatrix { - if {$state ni $knownstates} {lappend knownstates $state} - dict for {tok instructions} $transitions { - if {$tok ni $knowntokens} {lappend knowntokens $tok} - } - } - dict set stateMatrix nostate {} - foreach tok $knowntokens { - dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" - } - - - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #purpose - debugging? remove? - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #build a list of 'push triggers' from the stateMatrix - # ie tokens which can push a new space onto spacestack - set push_trigger_tokens [list] - tcl::dict::for {s transitions} $stateMatrix { - tcl::dict::for {token transition_to} $transitions { - set instruction [lindex $transition_to 0] - switch -exact -- $instruction { - PUSHSPACE - zeropoppushspace { - if {$token ni $push_trigger_tokens} { - lappend push_trigger_tokens $token - } - } - } - } - } - ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - - - - #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) 
- #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE - - #mainly for the -space states: - #redirect to another state $c based on a state transition from $whatever to $b - # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. - #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' - - #Push to, next - #default first states when we push to these spaces - variable spacePushTransitions { - keyval-space keyval-syntax - itable-keyval-space itable-keyval-syntax - array-space array-space - table-space tablename-state - } - #itable-space itable-space - #Pop to, next - variable spacePopTransitions { - array-space array-syntax - } - #itable-space curly-syntax - #itable-keyval-space itable-val-tail - #review - #we pop to keyval-space from dottedkey-space or from value-expected? we don't always want to go to keyval-tail - #leave it out and make the POPSPACE caller explicitly specify it - #keyval-space keyval-tail - - variable spaceSameTransitions { - array-space array-syntax - } - #itable-space curly-syntax - #itable-keyval-space itable-val-tail - - - variable state_list ;#reset every tomlish::decode::toml - - namespace export tomlish toml - namespace ensemble create - - #goNextState has various side-effects e.g pushes and pops spacestack - #REVIEW - setting nest and v elements here is ugly - #todo - make neater, more single-purpose? - proc goNextState {tokentype tok currentstate} { - variable state - variable nest - variable v - - set prevstate $currentstate - - - variable spacePopTransitions - variable spacePushTransitions - variable spaceSameTransitions - - variable last_space_action "none" - variable last_space_type "none" - variable state_list - - set result "" - set starttok "" - - if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { - set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] - ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" - switch -exact -- [lindex $transition_to 0] { - POPSPACE { - spacestack pop - set parent_info [spacestack peek] - set type [dict get $parent_info type] - set parentspace [dict get $parent_info state] - - set last_space_action "pop" - set last_space_type $type - - if {[dict exists $parent_info returnstate]} { - set next [dict get $parent_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected to stored returnstate $next <<---" - } else { - ### - #review - do away with spacePopTransitions - which although useful to provide a default.. 
- # - involve error-prone configurations distant to the main state transition configuration in stateMatrix - if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { - set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" - } else { - set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition to parent space $parentspace<<---" - } - } - set result $next - } - SAMESPACE { - set currentspace_info [spacestack peek] - ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" - set type [dict get $currentspace_info type] - set currentspace [dict get $currentspace_info state] - - if {[dict exists $currentspace_info returnstate]} { - set next [dict get $currentspace_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" - } else { - if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { - set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" - } else { - set next $currentspace - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" - } - } - set result $next - } - zeropoppushspace { - if {$nest > 0} { - #pop back down to the root level (table-space) - spacestack pop - set parentinfo [spacestack peek] - set type [dict get $parentinfo type] - set target [dict get $parentinfo state] - - set last_space_action "pop" - set last_space_type $type - - #----- - #standard pop - set parentlevel [expr {$nest -1}] - lappend v($parentlevel) [set v($nest)] - incr nest -1 - #----- - } - #re-entrancy - - #set next [list PUSHSPACE [lindex $transition_to 1]] - set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 - #::tomlish::log::notice "goNextState REENTRANCY. calling goNextState $nexttokentype $tokentype" - #set result [::tomlish::parse::goNextState $nexttokentype $tokentype] - ::tomlish::log::debug "--->> zeropoppushspace goNextState REENTRANCY. calling goNextState $nexttokentype $currentstate" - set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] - set result [dict get $transition_info newstate] - } - PUSHSPACE { - set original_target [dict get $transition_to PUSHSPACE] - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. 
- } - if {[dict exists $transition_to starttok]} { - set starttok [dict get $transition_to starttok] - } - spacestack push [dict create type space state $original_target] - - set last_space_action "push" - set last_space_type "space" - - if {[dict exists $transition_to state]} { - #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) - set next [dict get $transition_to state] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" - } else { - #puts $::tomlish::parse::spacePushTransitions - if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { - set next [dict get $::tomlish::parse::spacePushTransitions $original_target] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " - } else { - set next $original_target - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" - } - } - set result $next - } - TOSTATE { - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. - } - set result [dict get $transition_to TOSTATE] - } - default { - #simplified version of TOSTATE - set result [lindex $transition_to 0] ;#ignore everything but first word - } - } - } else { - ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" - set result "nostate" - } - lappend state_list [list tokentype $tokentype from $currentstate to $result] - set state $result - ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " - return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] - } - - proc report_line {{line ""}} { - variable linenum - variable is_parsing - if {$is_parsing} { - if {$line eq ""} { - set line $linenum - } - return "Line Number: $line" - } else { - #not in the middle of parsing tomlish text - return nothing. - return "" - } - } - - #produce a *slightly* more readable string rep of the nest for puts etc. 
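# --- Editor's note: illustrative sketch only, not part of the diff above. ---
# The shape of the dict returned by goNextState, matching the 'dict create' at the end of the
# proc above. The sample values are illustrative (a barekey token seen in table-space pushes
# keyval-space and lands in keyval-syntax); in real use the dict comes from a call made inside
# the tokenizer loop, since goNextState also has side effects on the spacestack.
set transition_info [dict create prevstate table-space newstate keyval-syntax space_action push starttok ""]
set space_action [dict get $transition_info space_action]   ;# one of: none | push | pop
if {$space_action eq "push"} {
    puts "pushed a new space; parser now in state [dict get $transition_info newstate]"
}
# --- end editor's note ---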
- proc nest_pretty1 {list} { - set prettier "{" - - foreach el $list { - if { [lindex $el 0] eq "NEWLINE"} { - append prettier "[list $el]\n" - } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { - append prettier [nest_pretty1 $el] - } else { - append prettier "[list $el] " - } - } - append prettier "}" - return $prettier - } - - proc set_tokenType {t} { - variable tokenType - variable tokenType_list - if {![info exists tokenType]} { - set tokenType "" - } - lappend tokenType_list $t - set tokenType $t - } - - proc switch_tokenType {t} { - variable tokenType - variable tokenType_list - lset tokenType_list end $t - set tokenType $t - } - - proc get_tokenType {} { - variable tokenType - return $tokenType - } - - proc _shortcircuit_startquotesequence {} { - variable tok - variable i - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - set_tokenType "startquote" - incr i -1 - return -level 2 1 - } elseif {$toklen == 2} { - puts stderr "_shortcircuit_startquotesequence toklen 2" - set_tokenType "startquote" - set tok "\"" - incr i -2 - return -level 2 1 - } - } - - proc get_token_waiting {} { - variable token_waiting - return [lindex $token_waiting 0] - } - proc clear_token_waiting {} { - variable token_waiting - set token_waiting [list] - } - - #token_waiting is a list - but our standard case is to have only one - #in certain circumstances such as near eof we may have 2 - #the set_token_waiting function only allows setting when there is not already one waiting. - #we want to catch cases of inadvertently trying to set multiple - # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. - proc set_token_waiting {args} { - if {[llength $args] %2 != 0} { - error "tomlish set_token_waiting must have args of form: type value complete 0|1" - } - variable token_waiting - - if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { - #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another - #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context - #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it - set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" - append err \n " - cannot add token_waiting: $args" - error $err - #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] - #set token_waiting [list] - } - - set waiting [dict create] - dict for {k v} $args { - switch -exact $k { - type - complete { - dict set waiting $k $v - } - value { - dict set waiting tok $v - } - startindex { - dict set waiting startindex $v - } - default { - error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" - } - } - } - if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { - error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" - } - if {![llength $token_waiting]} { - set token_waiting [list $waiting] - } else { - #an extra sanity-check that we don't have more than just the eof.. - if {[llength $token_waiting] > 1} { - set err "tomlish Unexpected. 
Existing token_waiting count > 1.\n" - foreach tw $token_waiting { - append err " $tw" \n - } - append err " - cannot add token_waiting: $waiting" - error $err - } - #last entry must be a waiting eof - set token_waiting [list $waiting [lindex $token_waiting end]] - } - return - } - - #returns 0 or 1 - #tomlish::parse::tok - #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag - # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) - # - interactive use? - - proc tok {s} { - variable nest - variable v - variable i - variable tok - variable type ;#character type - variable state ;#FSM - - - variable tokenType - variable tokenType_list - - - variable endToken - - variable lastChar - - variable braceCount - variable bracketCount - - - #------------------------------ - #Previous run found another (presumably single-char) token - #The normal case is for there to be only one dict in the list - #multiple is an exception - primarily for eof - variable token_waiting - if {[llength $token_waiting]} { - set waiting [lindex $token_waiting 0] - - set tokenType [dict get $waiting type] - set tok [dict get $waiting tok] - #todo: dict get $token_waiting complete - set token_waiting [lrange $token_waiting 1 end] - return 1 - } - #------------------------------ - - set resultlist [list] - set sLen [tcl::string::length $s] - - set slash_active 0 - set quote 0 - set c "" - set multi_dquote "" - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $s [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $s $i] - set cindex $i - set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] - tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" - #puts "got char $c during tokenType '$tokenType'" - incr i ;#must incr here because we do returns inside the loop - - switch -exact -- $ctest { - # { - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - barekey { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" - } - whitespace { - # hash marks end of whitespace token - #do a return for the whitespace, set token_waiting - #set_token_waiting type comment value "" complete 1 - incr i -1 ;#leave comment for next run - return 1 - } - untyped_value { - #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? - #we will accept a comment marker as an immediate terminator of the untyped_value. - incr i -1 - return 1 - } - starttablename - starttablearrayname { - #fix! - error "tomlish Character '#' is invalid first character for $tokenType. 
[tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out - append tok $c - } - default { - #dquotedkey, itablequotedkey, string,literal, multistring - append tok $c - } - } - } else { - switch -- $state { - multistring-space { - set_tokenType stringpart - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes#" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "#" - } - default { - #start of token if we're not in a token - set_tokenType comment - set tok "" ;#The hash is not part of the comment data - } - } - } - } - lc { - #left curly brace - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - starttablename - starttablearrayname { - #*bare* tablename can only contain letters,digits underscores - error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #valid in quoted parts - append tok $c - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - switch -exact -- $state { - itable-keyval-value-expected - keyval-value-expected - value-expected { - #switch last key to tablename?? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - array-space - array-syntax { - #nested anonymous inline table - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\{" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\{" - } - default { - error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" - } - } - } - - } - rc { - #right curly brace - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - starttablename - tablename { - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endinlinetable value "" complete 1 startindex $cindex - return 1 - } - starttablearrayname - tablearrayname { - if {$had_slash} {append tok "\\"} - #invalid! 
- but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } - default { - #end any other token - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - value-expected { - #invalid - but allow parser statemachine to report it. - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-space { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname-state problem" - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - #invalid - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - XXXcurly-syntax { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-val-tail { - set_tokenType "endinlinetable" - set tok "" - #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 - incr i -1 - return 1 - } - itable-keyval-syntax { - error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\}" - } - multiliteral-space { - set_tokenType "literalpart" ; #review - set tok "\}" - } - default { - #JMN2024b keyval-tail? - error "tomlish state '$state'. 
endinlinetable case not implemented [tomlish::parse::report_line]" - } - } - } - - } - lb { - #left square bracket - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - starttablename { - #change the tokenType - switch_tokenType "starttablearrayname" - set tok "" ;#no output into the tomlish list for this token - #any following whitespace is part of the tablearrayname, so return now - return 1 - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - #append tok "\\[" - append tok {\[} - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - #invalid at this point - state machine should disallow table -> starttablearrayname - set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "\[" - } - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - keyval-value-expected - itable-keyval-value-expected - value-expected { - set_tokenType "startarray" - set tok "\[" - return 1 - } - table-space { - #table name - #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray - #note that a starttablearrayname token may contain whitespace between the brackets - # e.g \[ \[ - set_tokenType "starttablename" - set tok "" ;#there is no output into the tomlish list for this token - } - array-space - array-syntax { - #nested array? - set_tokenType "startarray" - set tok "\[" - return 1 - #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\[" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\[" - } - itable-space { - #handle state just to give specific error msg - error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" - } - default { - error "tomlish state: '$state'. 
startarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - rb { - #right square bracket - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - #???? - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } else { - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablename value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" - } - } - } - tablearraynames { - #todo? - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } - default { - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - value-expected { - #invalid - but allow parser statemachine to report it. - set_tokenType "endarray" - set tok "\]" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endarray" - set tok "\]" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endtablename" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname problem" - set_tokenType "endtablearray" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - set_tokenType "endarray" - set tok "\]" - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "$dquotes\]" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\]" - } - default { - error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - bsl { - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
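# A rough sketch of how the backslash branch below treats escapes, inferred
# from this code (illustrative inputs, not a separate spec):
#   basic string  "a\tb"   - the backslash only sets slash_active here; the
#                            escape itself is resolved later (see the STRING
#                            handling via ::tomlish::utils::unescape_string)
#   basic string  "a\\b"   - the second backslash appends the raw pair \\ to
#                            the token, again left for later unescaping
#   literal 'a\tb'         - backslash is plain data; slash_active stays 0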
- #backslash - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - #end whitespace token - incr i -1 ;#reprocess bsl in next run - return 1 - } else { - error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" - } - } - literal - literalpart - squotedkey { - #never need to set slash_active true when in single quoted tokens - append tok "\\" - set slash_active 0 - } - XXXitablesquotedkey { - } - string - dquotedkey - itablequotedkey - comment { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - stringpart { - if {$slash_active} { - #assert - quotes empty - or we wouldn't have slash_active - set slash_active 0 - append tok "\\\\" - } else { - append tok $dquotes - set slash_active 1 - } - } - starttablename - starttablearrayname { - error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" - } - tablename - tablearrayname { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - barekey { - error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" - } - default { - error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$slash_active} { - set_tokenType "stringpart" - set tok "\\\\" - set slash_active 0 - } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - } - set slash_active 1 - } - } - multiliteral-space { - #nothing can be escaped in multiliteral-space - not even squotes (?) review - set_tokenType "literalpart" - set tok "\\" - } - default { - error "tomlish tok error: Unexpected backslash when no token is active. [tomlish::parse::report_line]" - } - } - } - } - sq { - #single quote - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - #short squote_seq tokens are returned if active during any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - switch -- $state { - leading-squote-space { - append tok $c - if {$existingtoklen > 2} { - error "tomlish tok error: squote_seq unexpected length $existingtoklen when another received" - } elseif {$existingtoklen == 2} { - return 1 ;#return tok ''' - } - } - trailing-squote-space { - append tok $c - if {$existingtoklen == 4} { - #maxlen to be an squote_seq is multisquote + 2 = 5 - #return tok ''''' - return 1 - } - } - default { - error "tomlish tok error: squote_seq in unexpected state '$state' - expected leading-squote-space or trailing-squote-space" - } - } - } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - #temp token creatable only during value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - append tok $c - } - 2 { - #switch? 
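# Two single quotes are already buffered in tok, so this third one completes
# ''' - the opening delimiter of a multi-line literal string - and is emitted
# below as a single triple_squote token. Illustrative input:
#   x = '''raw
#   text'''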
- append tok $c - set_tokenType triple_squote - return 1 - } - default { - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" - } - } - } - literal { - #slash_active always false - #terminate the literal - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - literalpart { - #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) - #todo - # idea: end this literalpart (possibly 'temporarily') - # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack - # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to an squote_seq token for later processing - return 1 - } - XXXitablesquotedkey { - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - squotedkey { - ### - #set_token_waiting type endsquote value "'" complete 1 - return 1 - } - starttablename - starttablearrayname { - #!!! - incr i -1 - return 1 - } - tablename - tablearrayname { - append tok $c - } - barekey { - #not clear why o'shennanigan shouldn't be a legal barekey - but it seems not to be. - error "tomlish Unexpected single quote during barekey. [tomlish::parse::report_line]" - } - default { - append tok $c - } - } - } else { - switch -exact -- $state { - value-expected - array-space { - set_tokenType "_start_squote_sequence" - set tok "'" - } - itable-keyval-value-expected - keyval-value-expected { - set_tokenType "squote_seq_begin" - set tok "'" - return 1 - } - table-space { - #tests: squotedkey.test - set_tokenType "squotedkey" - set tok "" - } - itable-space { - #tests: squotedkey_itable.test - set_tokenType "squotedkey" - set tok "" - } - XXXitable-space { - #future - could there be multiline keys? - #this would allow arbitrary tcl dicts to be stored in toml - #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - set_tokenType "squote_seq_begin" - set tok "'" - return 1 - } - tablename-state { - #first char in tablename-state/tablearrayname-state - set_tokenType tablename - append tok "'" - } - tablearrayname-state { - set_tokenType tablearrayname - append tok "'" - } - literal-state { - tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType literal - incr -1 - return 1 - } - multistring-space { - error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up an squote_seq to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "squote_seq_begin" - set tok "'" - return 1 - } - dottedkey-space { - set_tokenType squotedkey - } - default { - error "tomlish unhandled squote during state '$state'. 
[tomlish::parse::report_line]" - } - } - } - - } - dq { - #double quote - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - append tok $c - } elseif {$toklen == 2} { - append tok $c - #switch vs set? - set_tokenType "startmultiquote" - return 1 - } else { - error "tomlish unexpected token length $toklen in 'startquotesequence'" - } - } - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - set_tokenType "startsquote" - incr i -1 - return 1 - } - 2 { - set_tokenType "startsquote" - incr i -2 - return 1 - } - default { - error "tomlish unexpected _start_squote_sequence length $toklen" - } - } - } - literal - literalpart { - append tok $c - } - string { - if {$had_slash} { - append tok "\\" $c - } else { - #unescaped quote always terminates a string? - set_token_waiting type endquote value "\"" complete 1 startindex $cindex - return 1 - } - } - stringpart { - #sub element of multistring - if {$had_slash} { - append tok "\\" $c - } else { - #incr i -1 - - if {$multi_dquote eq "\"\""} { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex -2}] - set multi_dquote "" - return 1 - } else { - append multi_dquote "\"" - } - } - } - whitespace { - switch -exact -- $state { - multistring-space { - #REVIEW - if {$had_slash} { - incr i -2 - return 1 - } else { - switch -- [tcl::string::length $multi_dquote] { - 2 { - set_token_waiting type endmultiquote value "\"\"\"" complete 1 startindex [expr {$cindex-2}] - set multi_dquote "" - return 1 - } - 1 { - incr i -2 - return 1 - } - 0 { - incr i -1 - return 1 - } - } - } - } - keyval-value-expected - value-expected { - #end whitespace token and reprocess - incr i -1 - return 1 - - #if {$multi_dquote eq "\"\""} { - # set_token_waiting type startmultiquote value "\"\"\"" complete 1 - # set multi_dquote "" - # return 1 - #} else { - # #end whitespace token and reprocess - # incr i -1 - # return 1 - #} - } - table-space - itable-space { - incr i -1 - return 1 - } - default { - set_token_waiting type startquote value "\"" complete 1 startindex $cindex - return 1 - } - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - XXXdquotedkey - XXXitablequotedkey { - if {$had_slash} { - append tok "\\" - append tok $c - } else { - set_token_waiting type endquote value "\"" complete 1 startindex $cindex - return 1 - } - } - dquotedkey { - ### - if {$had_slash} { - append tok "\\" - append tok $c - } else { - #set_token_waiting type endsquote value "'" complete 1 - return 1 - } - } - squotedkey { - append tok $c - } - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - starttablearrayname { - incr i -1 ;## - return 1 - } - default { - error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - #$slash_active not relevant when no tokenType - #token is string only if we're expecting a value at this point - switch -exact -- $state { - keyval-value-expected - value-expected - array-space { - #!? 
start looking for possible multistartquote - #set_tokenType startquote - #set tok $c - #return 1 - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - itable-keyval-value-expected { - #JMN 2025 - review - set_tokenType "startquotesequence" ;#one or more quotes in a row - either startquote or multistartquote - set tok $c - } - multistring-space { - #TODO - had_slash!!! - #REVIEW - if {$had_slash} { - set_tokenType "stringpart" - set tok "\\\"" - set multi_dquote "" - } else { - if {$multi_dquote eq "\"\""} { - tomlish::log::debug "- tokloop char dq ---> endmultiquote" - set_tokenType "endmultiquote" - set tok "\"\"\"" - return 1 - #set_token_waiting type endmultiquote value "\"\"\"" complete 1 - #set multi_dquote "" - #return 1 - } else { - append multi_dquote "\"" - } - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\"" - } - XXXtable-space { - set_tokenType "startquote" - set tok $c - return 1 - } - XXXitable-space { - set_tokenType "startquote" - set tok $c - } - table-space - itable-space { - set_tokenType "dquotedkey" - set tok "" - } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - dottedkey-space { - set_tokenType dquotedkey - set tok "" - - #only if complex keys become a thing - #set_tokenType dquote_seq_begin - #set tok $c - } - default { - error "tomlish Unexpected quote during state '$state' [tomlish::parse::report_line]" - } - } - } - } - = { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart - squotedkey { - #assertion had_slash 0, multi_dquote "" - append tok $c - } - string - comment - dquotedkey - itablequotedkey { - #for these tokenTypes an = is just data. - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - whitespace { - if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - set_token_waiting type equal value = complete 1 startindex $cindex - return 1 - } - } - barekey { - #set_token_waiting type equal value = complete 1 - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out - append tok $c - } - default { - error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok ${dquotes}= - } - multiliteral-space { - set_tokenType "literalpart" - set tok "=" - } - dottedkey-space { - set_tokenType "equal" - set tok "=" - return 1 - } - default { - set_tokenType "equal" - set tok = - return 1 - } - } - } - } - cr { - #REVIEW! - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
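# For context on the double-quote handling above, some illustrative inputs and
# the token types they lead to (inferred from this tokenizer, not a separate spec):
#   x = "basic"       - a single quote resolves to a startquote token
#   x = """multi"""   - three quotes in a row resolve to startmultiquote
#   "quoted key" = 1  - in table-space the quote instead starts a dquotedkey token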
- # \r carriage return - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #we have received a double cr - ::tomlish::log::warn "double cr - will generate cr token. needs testing" - set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal { - append tok $c - } - literalpart { - #part of MLL string (multi-line literal string) - #we need to split out crlf as a separate NEWLINE to be consistent - ::tomlish::log::warn "literalpart ended by cr - needs testing" - #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space - incr i -1 - return 1 - } - stringpart { - #stringpart is a part of MLB string (multi-line basic string) - #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #could in theory be valid in quoted part of name - #review - might be better just to disallow here - append tok $c - } - whitespace { - #it should technically be part of whitespace if not followed by lf - #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW - incr i -1 - return 1 - } - untyped_value { - incr i -1 - return 1 - } - default { - #!todo - error out if cr inappropriate for tokenType - append tok $c - } - } - } else { - #lf may be appended if next - #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) - set_tokenType "newline" - set tok cr - } - } - lf { - # \n newline - set dquotes $multi_dquote - set multi_dquote "" ;#!! 
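# Sketch of the newline token shapes produced around here, inferred from the
# cr handling above and the lf handling below (illustrative):
#   \r\n                   -> one newline token with tok "crlf"
#   \n                     -> one newline token with tok "lf"
#   \r not followed by \n  -> a lone "cr" token, which the state machine
#                             generally treats as an error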
- set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #review - #this lf is the trailing part of a crlf - append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal { - #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' - #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - literalpart { - #we allow newlines - but store them within the multiliteral as their own element - #This is a legitimate end to the literalpart - but not the whole multiliteral - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - stringpart { - if {$dquotes ne ""} { - append tok $dquotes - incr i -1 - return 1 - } else { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - } - starttablename - tablename - tablearrayname - starttablearrayname { - error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" - } - default { - #newline ends all other tokens. - #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) - #note for whitespace: - # we will use the convention that \n terminates the current whitespace even if whitespace follows - # ie whitespace is split into separate whitespace tokens at each newline - - #puts "-------------- newline lf during tokenType $tokenType" - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - if {$dquotes ne ""} { - #e.g one or 2 quotes just before nl - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } - set_tokenType "newline" - set tok lf - return 1 - } - } - multiliteral-space { - #assert had_slash 0, multi_dquote "" - set_tokenType "newline" - set tok "lf" - return 1 - } - default { - #ignore slash? error? 
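# Note on the multistring-space branch above: a backslash immediately before
# the newline is emitted as a 'continuation' token rather than being folded
# into the string, eg (illustrative):
#   key = """line one \
#   line two"""
# so later processing can do the CONT stripping of the line-ending backslash
# and the whitespace that follows it.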
- set_tokenType "newline" - set tok lf - return 1 - } - } - #if {$had_slash} { - # #CONT directly before newline - allows strings_5_byteequivalent test to pass - # set_tokenType "continuation" - # set tok "\\" - # incr i -1 - # return 1 - #} else { - # set_tokenType newline - # set tok lf - # return 1 - #} - } - } - , { - set dquotes $multi_dquote - set multi_dquote "" - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - comment - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok , - } - string - dquotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - #stringpart can have up to 2 quotes too - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - set_token_waiting type comma value "," complete 1 startindex $cindex - return 1 - } - } - default { - set_token_waiting type comma value "," complete 1 startindex $cindex - if {$had_slash} {append tok "\\"} - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "$dquotes," - } - multiliteral-space { - #assert had_slash 0, multi_dquote "" - set_tokenType "literalpart" - set tok "," - } - default { - set_tokenType "comma" - set tok "," - return 1 - } - } - } - } - . { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - comment - untyped_value { - if {$had_slash} {append tok "\\"} - append tok $c - } - string - dquotedkey - itablequotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $dquotes$c - } - literal - literalpart - squotedkey { - #assert had_slash always 0, multi_dquote "" - append tok $c - } - whitespace { - switch -exact -- $state { - multistring-space { - set backchars [expr {[tcl::string::length $dquotes] + 1}] - if {$had_slash} { - incr backchars 1 - } - incr i -$backchars - return 1 - } - xxxdottedkey-space { - incr i -1 - return 1 - } - dottedkey-space-tail { - incr i -1 - return 1 - } - default { - error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" - } - } - } - starttablename - starttablearrayname { - #This would correspond to an empty table name - error "tomlish Character '.' is not allowed as first character ($tokenType). 
[tomlish::parse::report_line]" - } - tablename - tablearrayname { - #subtable - split later - review - append tok $c - } - barekey { - #e.g x.y = 1 - #we need to transition the barekey to become a structured table name ??? review - #x is the tablename y is the key - set_token_waiting type dotsep value "." complete 1 startindex $cindex - return 1 - } - default { - error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" - #set_token_waiting type period value . complete 1 - #return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "$dquotes." - } - multiliteral-space { - set_tokenType "literalpart" - set tok "." - } - XXXdottedkey-space { - ### obs? - set_tokenType "dotsep" - set tok "." - return 1 - } - dottedkey-space-tail { - ### - set_tokenType "dotsep" - set tok "." - return 1 - } - default { - set_tokenType "untyped_value" - set tok "." - } - } - } - - } - " " { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - if {[tcl::string::length $tokenType]} { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - barekey { - #todo had_slash - emit token or error - #whitespace is a terminator for bare keys - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - comment { - if {$had_slash} { - append tok "\\" - } - append tok $dquotes$c - } - string - dquotedkey - itablequotedkey { - if {$had_slash} { append tok "\\" } - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - append tok $dquotes - incr i -1 - return 1 - } - } - literal - literalpart - squotedkey { - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - #end whitespace token - #go back by the number of quotes plus this space char - set backchars [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backchars - return 1 - } else { - append tok $c - } - } else { - append tok $c - } - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. 
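# eg for a header written as  [ fruit . veg ]  the spaces become part of the
# single tablename token (roughly " fruit . veg ") rather than separate
# whitespace tokens; splitting on the dots and trimming is deferred to when
# the tomlish is interpreted as a datastructure.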
- set_tokenType tablename - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - if {$dquotes ne ""} { - set_tokenType "stringpart" - set tok $dquotes - incr i -1 - return 1 - } - set_tokenType "whitespace" - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - if {$had_slash} { - error "tomlish unexpected backslash [tomlish::parse::report_line]" - } - set_tokenType "whitespace" - append tok $c - } - } - } - } - tab { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - barekey { - #whitespace is a terminator for bare keys - incr i -1 - #set_token_waiting type whitespace value $c complete 1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - squotedkey { - append tok $c - } - dquotedkey - string - comment - whitespace { - #REVIEW - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - append tok $dquotes - incr i -1 - return 1 - } - } - literal - literalpart { - append tok $c - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearraynames { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - if {$slash_active} { - set slash_active 0 - } - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. 
- set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - if {$dquotes ne ""} { - set_tokenType stringpart - set tok $dquotes - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - set_tokenType "whitespace" - append tok $c - } - } - } - } - bom { - #BOM (Byte Order Mark) - ignored by token consumer - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - _start_squote_sequence { - #assert - tok will be one or two squotes only - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - literal - literalpart { - append tok $c - } - default { - set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - set_tokenType "bom" - set tok "\uFEFF" - return 1 - } - } - } - } - default { - set dquotes $multi_dquote - set multi_dquote "" ;#!! - - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - squote_seq { - incr i -1 - return 1 - } - startquotesequence { - _shortcircuit_startquotesequence - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "startsquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - if {$dquotes ne ""} { - set backlen [expr {[tcl::string::length $dquotes] + 1}] - incr i -$backlen - return 1 - } else { - incr i -1 - return 1 - } - } else { - #review - incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. - return 1 - } - } - barekey { - if {[tomlish::utils::is_barekey $c]} { - append tok $c - } else { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" - } - } - starttablename - starttablearrayname { - incr i -1 - #allow statemachine to set context for subsequent chars - return 1 - } - stringpart { - append tok $dquotes$c - } - default { - #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname - append tok $c - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - table-space - itable-space { - #if no currently active token - assume another key value pair - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } - XXXcurly-syntax { - puts stderr "curly-syntax - review" - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. 
[tomlish::parse::report_line]" - } - } - multistring-space { - set_tokenType "stringpart" - if {$had_slash} { - #assert - we don't get had_slash and dquotes at same time - set tok \\$c - } else { - set tok $dquotes$c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - tablename-state { - set_tokenType "tablename" - set tok $c - } - tablearrayname-state { - set_tokenType "tablearrayname" - set tok $c - } - dottedkey-space { - set_tokenType barekey - set tok $c - } - default { - #todo - something like ansistring VIEW to show control chars? - set cshow [string map [list \t tab \v vt] $c] - tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" - set_tokenType "untyped_value" - set tok $c - } - } - } - } - } - - } - - #run out of characters (eof) - if {[tcl::string::length $tokenType]} { - #check for invalid ending tokens - #if {$state eq "err-state"} { - # error "Reached end of data whilst tokenType = '$tokenType'. INVALID" - #} - switch -exact -- $tokenType { - startquotesequence { - set toklen [tcl::string::length $tok] - if {$toklen == 1} { - #invalid - #eof with open string - error "tomlish eof reached without closing quote for string. [tomlish::parse::report_line]" - } elseif {$toklen == 2} { - #valid - #we ended in a double quote, not actually a startquoteseqence - effectively an empty string - switch_tokenType "startquote" - incr i -1 - #set_token_waiting type string value "" complete 1 - return 1 - } - } - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open literal - error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" - } - 2 { - #review - set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] - set_tokenType "literal" - set tok "" - return 1 - } - } - } - newline { - #The only newline token that has still not been returned should have a tok value of "cr" - puts "tomlish eof reached - with incomplete newline token '$tok'" - if {$tok eq "cr"} { - #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. - #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) - #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values - # ie as it's own token. - switch_tokenType "cr" - return 1 - } else { - #should be unreachable - error "tomlish eof reached - with invalid newline token. value: $tok" - } - } - } - set_token_waiting type eof value eof complete 1 startindex $i ;#review - return 1 - } else { - ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" - set tokenType "eof" - set tok "eof" - } - return 0 - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] -} - -namespace eval tomlish::dict { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - - proc is_tomlish_typeval {d} { - #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} - #as a sanity check we need to avoid mistaking user data that happens to match same form - #consider x.y={type="spud",value="blah"} - #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. 
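#illustrative results of the check below:
#  is_tomlish_typeval {type INT value 3}        -> 1
#  is_tomlish_typeval {type STRING value spud}  -> 1
#  is_tomlish_typeval {type {type STRING value spud} value {type STRING value blah}}  -> 0
#The last is the x.y={type="spud",value="blah"} user-data case - its 'type'
#entry is itself a tagged list, so the llength check rejects it.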
- #check the length of the type as a quick way to see it's a tag - not something else masqerading. - expr {[dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} - } - proc is_tomlish_typeval2 {d} { - upvar ::tomlish::tags tags - expr {[lindex $d 0] eq "type" && [lindex $d 1] in $tags} - } - proc last_tomltype_posn {d} { - set last_simple -1 - set dictposn [expr {[dict size $d] -1}] - foreach k [lreverse [dict keys $d]] { - set dval [dict get $d $k] - if {[is_tomlish_typeval $dval]} { - set last_simple $dictposn - break - } - incr dictposn -1 - } - return $last_simple - } - - - #review - proc name_from_tablestack {tablestack} { - set name "" - foreach tinfo [lrange $tablestack 1 end] { - lassign $tinfo type namepart - switch -- $type { - T { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - I { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - default { - #end at first break in the leading sequence of T & I tablenames - break - } - } - } - return $name - } - -} - -tcl::namespace::eval tomlish::app { - variable applist [list encoder decoder test] - - #*** !doctools - #[subsection {Namespace tomlish::app}] - #[para] - #[list_begin definitions] - - proc decoder {args} { - #*** !doctools - #[call app::[fun decoder] [arg args]] - #[para] read toml on stdin until EOF - #[para] on error - returns non-zero exit code and writes error on stderr - #[para] on success - returns zero exit code and writes JSON encoding of the data on stdout - #[para] This decoder is intended to be compatible with toml-test - - set opts [dict merge [dict create] $args] - #fconfigure stdin -encoding utf-8 - fconfigure stdin -translation binary - #Just slurp it all - presumably we are not handling massive amounts of data on stdin. - # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. - if {[catch { - set toml [read stdin] - }]} { - exit 2 ;#read error - } - try { - set j [::tomlish::toml_to_json $toml] - } on error {em} { - puts stderr "decoding failed: '$em'" - exit 1 - } - puts -nonewline stdout $j - exit 0 - } - - proc encoder {args} { - #*** !doctools - #[call app::[fun encoder] [arg args]] - #[para] read JSON on stdin until EOF - #[para] return non-zero exitcode if JSON data cannot be converted to a valid TOML representation - #[para] return zero exitcode and TOML data on stdout if JSON data can be converted. 
- #[para] This encoder is intended to be compatible with toml-test - - set opts [dict merge [dict create] $args] - fconfigure stdin -translation binary - if {[catch { - set json [read stdin] - }]} { - exit 2 ;#read error - } - try { - set toml [::tomlish::json_to_toml $json] - } on error {em} { - puts stderr "encoding failed: '$em'" - exit 1 - } - puts -nonewline stdout $toml - exit 0 - } - - proc test {args} { - set opts [dict merge [dict create] $args] - - package require test::tomlish - if {[dict exists $opts -suite]} { - test::tomlish::suite [dict get $opts -suite] - } - test::tomlish::run - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::app ---}] -} - -proc ::tomlish::appnames {} { - set applist [list] - foreach cmd [info commands ::tomlish::app::*] { - lappend applist [namespace tail $cmd] - } - return $applist -} - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Secondary API namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish::lib { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - #*** !doctools - #[subsection {Namespace tomlish::lib}] - #[para] Secondary functions that are part of the API - #[list_begin definitions] - - #proc utility1 {p1 args} { - # #*** !doctools - # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] - # #[para]Description of utility1 - # return 1 - #} - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -if {$argc > 0} { - puts stderr "argc: $argc args: $argv" - - if {($argc == 1)} { - if {[tcl::string::tolower $argv] in {help -help h -h}} { - puts stdout "Usage: -app where appname one of:[tomlish::appnames]" - exit 0 - } else { - puts stderr "Argument '$argv' not understood. Try -help" - exit 1 - } - } - set opts [dict create] - set opts [dict merge $opts $argv] - - set opts_understood [list -app ] - if {"-app" in [dict keys $opts]} { - #Don't vet the remaining opts - as they are interpreted by each app - } else { - foreach key [dict keys $opts] { - if {$key ni $opts_understood} { - puts stderr "Option '$key' not understood" - exit 1 - } - } - } - if {[dict exists $opts -app]} { - set app [dict get $opts -app] - if {$app ni [tomlish::appnames]} { - puts stderr "app '[dict get $opts -app]' not found" - exit 1 - } - tomlish::app::$app {*}$opts - } -} - -## Ready -package provide tomlish [namespace eval tomlish { - variable pkg tomlish - variable version - set version 1.1.3 -}] -return - -#*** !doctools -#[manpage_end] - diff --git a/src/vendormodules/tomlish-1.1.4.tm b/src/vendormodules/tomlish-1.1.4.tm deleted file mode 100644 index c472eace..00000000 --- a/src/vendormodules/tomlish-1.1.4.tm +++ /dev/null @@ -1,6801 +0,0 @@ -# -*- tcl -*- -# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt -# -# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. -# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. 
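# Usage sketch (an assumption based on the -app argument handling and the
# decoder/encoder apps in the 1.1.3 module above; 1.1.4 is presumed to keep
# the same entry point):
#   tclsh tomlish-1.1.4.tm -app decoder < input.toml > output.json
#   tclsh tomlish-1.1.4.tm -app encoder < input.json > output.toml
# Both apps are intended to be usable with the external toml-test harness.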
-# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# (C) 2024 -# -# @@ Meta Begin -# Application tomlish 1.1.4 -# Meta platform tcl -# Meta license -# @@ Meta End - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# doctools header -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -#*** !doctools -#[manpage_begin tomlish_module_tomlish 0 1.1.4] -#[copyright "2024"] -#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] -#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] -#[require tomlish] -#[keywords module parsing toml configuration] -#[description] -#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) -#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml -#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, -#[para] although these other formats are generally unlikely to retain whitespace or comments -#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. -#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. -#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions -#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key -#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) -#[para] will need a -type option (-force ?) to force overriding with another type such as an int. - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section Overview] -#[para] overview of tomlish -#[subsection Concepts] -#[para] - - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -## Requirements -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[subsection dependencies] -#[para] packages used by tomlish -#[list_begin itemized] - -package require Tcl 8.6- -package require struct::stack -package require logger - -#*** !doctools -#[item] [package {Tcl 8.6-}] -#[item] [package {struct::stack}] - -#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') - - -#*** !doctools -#[list_end] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section API] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Base namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish { - namespace export {[a-z]*}; # Convention: export all lowercase - variable types - - #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. - #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values - #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. - #todo - review - set existing_recursionlimit [interp recursionlimit {}] - if {$existing_recursionlimit < 5000} { - interp recursionlimit {} 5000 - } - - #IDEAS: - # since get_toml produces tomlish with whitespace/comments intact: - # tomldoc object - allow (at least basic?) 
editing of toml whilst preserving comments & whitespace - # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? - # - separate addKey?? - # - deleteKey (delete leaf) - # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) - # - set/add Table? - position in doc based on existing tables/subtables? - - #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - - # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. - #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n - #The newline is part of the keyval structure so makes reordering easier - #example from_toml "a=1\nb=2\n\n\n" - # 0 = TOMLISH - # 1 = KEY a = {INT 1} {NEWLINE lf} - # 2 = NEWLINE lf - # 3 = KEY b = {INT 2} {NEWLINE lf} - # 4 = NEWLINE lf - # 5 = NEWLINE lf - - #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, - # and duplicate table headers are allowed in that context. - #e.g - #[[fruits]] - # name="apple" - # [fruits.metadata] - # id=1 - # - #[unrelated1] - # - #[[fruits]] - # name="pear" - # - #[unrelated2] - # silly="ordering" - # - #[fruits.metadata] - #id=2 - #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. - #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, - # we would lose roundtripability toml->tomlish->toml - # ----------------------------------------------------- - #REVIEW - #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. - #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of to_dict - #The most practical way might be to use to_dict followed by from_dict - but that would lose comment info and formatting. - #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, - #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. - #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. 
(just as their name keys are) - # ----------------------------------------------------- - - - - #ARRAY is analogous to a Tcl list - #TABLE is analogous to a Tcl dict - #WS = inline whitespace - #KEY = bare key and value - #DQKEY = double quoted key and value - #SQKEY = single quoted key and value - #ITABLE = inline table (*can* be anonymous table) - # inline table values immediately create a table with the opening brace - # inline tables are fully defined between their braces, as are dotted-key subtables defined within - # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained - - set tags [list TOMLISH ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL DATETIME] - #removed - ANONTABLE - #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) - #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) - #todo - configurable - allow empty string for 'unlimited' - set min_int -9223372036854775808 ;#-2^63 - set max_int +9223372036854775807 ;#2^63-1 - - proc Dolog {lvl txt} { - #return "$lvl -- $txt" - set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" - puts stderr $msg - } - logger::initNamespace ::tomlish - foreach lvl [logger::levels] { - interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl - log::logproc $lvl tomlish_log_$lvl - } - - #*** !doctools - #[subsection {Namespace tomlish}] - #[para] Core API functions for tomlish - #[list_begin definitions] - - proc tags {} { - return $::tomlish::tags - } - - #helper function for to_dict - proc _get_keyval_value {keyval_element} { - log::notice ">>> _get_keyval_value from '$keyval_element'<<<" - set found_value 0 - #find the value - # 3 is the earliest index at which the value could occur (depending on whitespace) - set found_sub [list] - if {[lindex $keyval_element 2] ne "="} { - error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list" - } - - #review - if {[uplevel 1 [list info exists tablenames_info]]} { - upvar tablenames_info tablenames_info - } else { - set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc - } - - foreach sub [lrange $keyval_element 2 end] { - #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey - switch -exact -- [lindex $sub 0] { - STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { - set type [lindex $sub 0] - set value [lindex $sub 1] - set found_sub $sub - incr found_value 1 - } - default {} - } - } - if {!$found_value} { - error "tomlish Failed to find value element in KEY. '$keyval_element'" - } - if {$found_value > 1} { - error "tomlish Found multiple value elements in KEY, expected exactly one. '$keyval_element'" - } - - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - #simple (non-container, no-substitution) datatype - set result [list type $type value $value] - } - STRING - STRINGPART { - set result [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL - LITERALPART { - #REVIEW - set result [list type $type value $value] - } - TABLE { - #invalid? 
- error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" - } - ITABLE { - #This one should not be returned as a type value structure! - # - set result [::tomlish::to_dict [list $found_sub]] - } - ARRAY { - #we need to recurse to get the corresponding dict for the contained item(s) - #pass in the whole $found_sub - not just the $value! - set prev_tablenames_info $tablenames_info - set tablenames_info [dict create] - set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - set tablenames_info $prev_tablenames_info - } - MULTISTRING - MULTILITERAL { - #review - mapping these to STRING might make some conversions harder? - #if we keep the MULTI - we know we have to look for newlines for example when converting to json - #without specific types we'd have to check every STRING - and lose info about how best to map chars within it - set result [list type $type value [::tomlish::to_dict [list $found_sub]]] - } - default { - error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" - } - } - return $result - } - - proc _get_dottedkey_info {dottedkeyrecord} { - set key_hierarchy [list] - set key_hierarchy_raw [list] - if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { - error "tomlish _get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" - } - set compoundkeylist [lindex $dottedkeyrecord 1] - set expect_sep 0 - foreach part $compoundkeylist { - set parttag [lindex $part 0] - if {$parttag eq "WS"} { - continue - } - if {$expect_sep} { - if {$parttag ne "DOTSEP"} { - error "DOTTEDKEY missing dot separator between parts. '$dottedkeyrecord'" - } - set expect_sep 0 - } else { - set val [lindex $part 1] - switch -exact -- $parttag { - KEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw $val - } - DQKEY { - lappend key_hierarchy [::tomlish::utils::unescape_string $val] - lappend key_hierarchy_raw \"$val\" - } - SQKEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw "'$val'" - } - default { - error "DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$item'" - } - } - set expect_sep 1 - } - } - return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] - } - - - - #to_dict is a *basic* programmatic datastructure for accessing the data. - # produce a dictionary of keys and values from a tomlish tagged list. - # to_dict is primarily for reading toml data. - #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, - # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. - # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. - #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. - # - - #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types - #(ARRAYS can be mixed type) - #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form - #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? - - #Namespacing? 
- #ie note the difference: - #[Data] - #temp = { cpu = 79.5, case = 72.0} - # versus - #[Data] - #temps = [{cpu = 79.5, case = 72.0}] - proc to_dict {tomlish} { - package require dictn - - #keep track of which tablenames have already been directly defined, - # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' - #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. - #we don't error out just because a previous tablename segment has already appeared. - - #Declaring, Creating, and Defining Tables - #https://github.com/toml-lang/toml/issues/795 - #(update - only Creating and Defining are relevant terminology) - - #review - #tablenames_info keys type created, defined, createdby, definedby, closedby ??? review keys - # [tname] = header_table [[tname]] = header_tablearray - - #consider the following 2 which are legal: - #[table] #'table' created, defined=open type header_table - #x.y = 3 - #[table.x.z] #'table' defined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header_table table.x.z} - #k= 22 - # #'table.x.z' defined=closed closedby={eof eof} - - #equivalent datastructure - - #[table] #'table' created, defined=open definedby={header_table table} - #[table.x] #'table' defined=closed closedby={header_table table.x}, 'table.x' created defined=open definedby={header_table table.x} - #y = 3 - #[table.x.z] #'table.x' defined=closed closedby={header_table table.x.z}, 'table.x.z' created defined=open definedby={header_table table.x.z} - #k=22 - - #illegal - #[table] #'table' created and defined=open - #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} - #[table.x.y.z] #'table' defined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created - #k = 22 - # - ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - - - if {[uplevel 1 [list info exists tablenames_info]]} { - upvar tablenames_info tablenames_info - } else { - set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) - } - - - log::info "---> to_dict processing '$tomlish'<<<" - set items $tomlish - - foreach lst $items { - if {[lindex $lst 0] ni $::tomlish::tags} { - error "supplied string does not appear to be toml parsed into a tomlish tagged list. 
Run tomlish::decode::toml on the raw toml data to produce a tomlish list" - } - } - - if {[lindex $tomlish 0] eq "TOMLISH"} { - #ignore TOMLISH tag at beginning - set items [lrange $tomlish 1 end] - } - - set datastructure [dict create] - foreach item $items { - set tag [lindex $item 0] - #puts "...> item:'$item' tag:'$tag'" - switch -exact -- $tag { - KEY - DQKEY - SQKEY { - log::debug "---> to_dict item: processing $tag: $item" - set key [lindex $item 1] - if {$tag eq "DQKEY"} { - set key [::tomlish::utils::unescape_string $key] - } - #!todo - normalize key. (may be quoted/doublequoted) - - if {[dict exists $datastructure $key]} { - error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." - } - - #lassign [_get_keyval_value $item] type val - set keyval_dict [_get_keyval_value $item] - dict set datastructure $key $keyval_dict - } - DOTTEDKEY { - log::debug "---> to_dict item processing $tag: $item" - set dkey_info [_get_dottedkey_info $item] - set dotted_key_hierarchy [dict get $dkey_info keys] - - #a.b.c = 1 - #table_key_hierarchy -> a b - #tleaf -> c - if {[llength $dotted_key_hierarchy] == 0} { - #empty?? probably invalid. review - #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively - error "DOTTED key has no parts - invalid? '$item'" - } elseif {[llength $dotted_key_hierarchy] == 1} { - #dottedkey is only a key - no table component - set table_hierarchy [list] - set tleaf [lindex $dotted_key_hierarchy 0] - } else { - set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set tleaf [lindex $dotted_key_hierarchy end] - } - - #ensure empty tables are still represented in the datastructure - #review - this seems unnecessary? - set pathkeys [list] - foreach k $table_hierarchy { - lappend pathkeys $k - if {![dict exists $datastructure {*}$pathkeys]} { - dict set datastructure {*}$pathkeys [list] - } else { - tomlish::log::notice "to_dict datastructure at key path $pathkeys already had data: [dict get $datastructure {*}$pathkeys]" - } - } - #review? - if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { - error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." - } - - #JMN test 2025 - if {[llength $table_hierarchy]} { - dictn incr tablenames_info [list $table_hierarchy seencount] - } - - set keyval_dict [_get_keyval_value $item] - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - set t [list {*}$table_hierarchy $tleaf] - dictn incr tablenames_info [list $t seencount] - dictn set tablenames_info [list $t closed] 1 - - #review - item is an ITABLE - we recurse here without datastructure context :/ - #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict - } else { - dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict - } - - } - TABLEARRAY { - set dottedtables_defined [list] ;#for closing off at end by setting 'defined' - - set NEST_DICT [dict create] ;#first blush attempt at referencing supertable tablearrays - set tablearrayname [lindex $item 1] - log::debug "---> to_dict processing item TABLENAME (name: $tablearrayname): $item" - set norm_segments [::tomlish::utils::tablename_split $tablearrayname true] ;#true to normalize - #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. 
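A rough sketch of the tablearray accumulation described above, assuming decode::toml and to_dict behave as laid out in this file; the commented result shape is indicative only.

    # Hedged sketch: each repeated [[fruits]] header appends another raw element
    # dict to the same ARRAY value under the 'fruits' key.
    set toml "\[\[fruits\]\]\nname = \"apple\"\n\n\[\[fruits\]\]\nname = \"pear\"\n"
    set d [tomlish::to_dict [tomlish::decode::toml $toml]]
    # indicative shape of $d:
    #   fruits {type ARRAY value {{name {type STRING value apple}} {name {type STRING value pear}}}}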
- #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem - set supertable [list] - ############## - # [[a.b.c.d]] - # norm_segments = {a b c d} - #check a {a b} {a b c} <---- supertables of a.b.c.d - ############## - foreach normseg [lrange $norm_segments 0 end-1] { - lappend supertable $normseg - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this path doesn't yet exist - if {[dict exists $datastructure {*}$supertable]} { - #There is data though - so it must have been created as a keyval - set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablearray_supertable_keycollision - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } else { - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - #REVIEW!! - # what happens with from_toml {[[a.b.c]]} {[[a]]} ??? - dictn set tablenames_info [list $supertable type] header_table ;#how do we know it's not going to be a tablearray? - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } else { - #supertable has already been created - and maybe defined - but even if defined we can add subtables unless it is of type itable - #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' - #(another way of saying last member of that array)?? - set supertype [dictn get $tablenames_info [list $supertable type]] - if {$supertype eq "header_tablearray"} { - puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" - puts stdout "todict!!! todo.." - #how to do multilevel nesting?? - set EXISTING_SUPERARRAY_ELEMENTS [dict get $datastructure {*}$supertable value] - dict set NEST_DICT $supertable $EXISTING_SUPERARRAY_ELEMENTS - puts stdout "todict!!! supertable '[join $supertable .]' elements $EXISTING_SUPERARRAY_ELEMENTS" - } - } - } - # - if {![dictn exists $tablenames_info [list $norm_segments type]]} { - #first encounter of this tablearrayname - if {[dict exists $datastructure {*}$norm_segments]} { - #e.g from_toml {a=1} {[[a]]} - set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablearray_direct_keycollision_error - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #no collision - we can create the tablearray and the array in the datastructure - dictn set tablenames_info [list $norm_segments type] header_tablearray - dict set datastructure {*}$norm_segments [list type ARRAY value {}] - set ARRAY_ELEMENTS [list] - } else { - #we have a table - but is it a tablearray? - set ttype [dictn get $tablenames_info [list $norm_segments type]] - #use a tabletype_unknown type for previous 'created' only tables? - if {$ttype ne "header_tablearray"} { - set msg "tablearray name $tablearrayname already appears to be already created as a table not a tablearray - invalid?" 
- append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #add to array - #error "add_to_array not implemented" - #{type ARRAY value } - set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] - } - - - set object [dict create] ;#array context equivalent of 'datastructure' - set objectnames_info [dict create] ;#array contex equivalent of tablenames_info - - #add to ARRAY_ELEMENTS and write back in to datastructure. - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> todict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - #MAINTENANCE: temp copy from TABLE - #convert to function: data needed? element, norm_segments refs to: tablename_info datastructure ??? - set dkey_info [_get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - - #[a.b] - #t1.t2.dottedtable.k = "val" - #we have already checked supertables a {a b} - #We need to check {a b t1} & {a b t2} ('creation' only) - #and then 'dottedtable' is 'defined' while k is an ordinary key in dottedtable - - #note we also get here as a 'dottedkey' with a simple - #[a.b] - #k = "val" - - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] - #obsolete - set nonleaf_keys [lrange $all_dotted_keys 0 end-1] ;#may be empty - - if {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] - } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - - #we've already tested the table keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key - #set supertable $norm_segments - set supertable [list] ;#disconnect from main structure - each array element is a new context for key paths! - foreach normkey $dotparents { - lappend supertable $normkey - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this combined path (table norm_segments plus parts of dottedkey) not yet 'created' - if {[dict exists $datastructure {*}$supertable]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] 
of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dictn set tablenames_info [list $supertable type] unknown_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$norm_segments {*}$dotparents $dottedtable] ;#full path to dottedtable - #our dotted key is attempting to define a table - if {![dictn exists $tablenames_info [list $dottedpath type]]} { - #first one - but check datastructure for collisions - if {[dict exists $datastructure {*}$dottedpath]} { - set msg "Supertable [join $dottedpath .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dictn set tablenames_info [list $dottedpath type] dottedkey_table - #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dictn get $tablenames_info [list $dottedpath type]] eq "dottedkey_table"} { - #right type - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list $dottedpath defined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - } - } - } - #assert - dottedkey represents a key val pair that can be added - - - if {[dict exists $datastructure {*}$norm_segments {*}$all_dotted_keys]} { - set msg "Duplicate key '$norm_segments $all_dotted_keys'. The key already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - - set keyval_dict [_get_keyval_value $element] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> TABLE/dottedkey '$dottedkeyname' kv: $keyval_dict" - #dict set datastructure {*}$norm_segments {*}$all_dotted_keys $keyval_dict - #wrong - #TODO!!!!!!!!!!!!! - #lappend ARRAY_ELEMENTS [list $dottedkeyname $keyval_dict] - dict set object $dottedkeyname $keyval_dict - - #remove ? 
- if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - set tkey [list {*}$norm_segments {*}$all_dotted_keys] - #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] - - #by not creating a tablenames_info record - we effectively make it closed anyway? - #it should be detected as a key - #is there any need to store tablenames_info for it?? - #REVIEW - - ##TODO - update? - #dictn incr tablenames_info [list $tkey seencount] - ##if the keyval_dict is not a simple type x value y - then it's an inline table ? - ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - #dictn set tablenames_info [list $tkey closed] 1 - } - - } - NEWLINE - COMMENT - WS { - #ignore - } - TABLE { - #we should be able to process tablearray subtables either as part of the tablearray record, or independently. - #(or even a mixture of both, although that is somewhat an edge case) - #[[fruit]] - #x=1 - # [fruit.metadata] - # [fruit.otherdata] - - #when processing a dict destined for the above - the tomlish generator (e.g from_dict) - #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) - #choices: all in tablearray record, tablearray + 1 or 2 table records. - # - #We are going the other way here - so we just need to realize the list of tables 'belonging' to this tablearray might not be complete. - # - #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records - - } - default { - error "Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" - } - } - } - - #todo? - ##end of TABLE record - equivalent of EOF or next header - close off the dottedtables - #foreach dtablepath $dottedtables_defined { - # dictn set tablename_info [list $dtablepath defined] closed - #} - - if {[dict size $NEST_DICT]} { - puts "reintegrate?? $NEST_DICT" - #todo - more - what if multiple in hierarchy? - dict for {superpath existing_elements} $NEST_DICT { - #objects stored directly as dicts in ARRAY value - set lastd [lindex $existing_elements end] - #insufficient.. - #dict set lastd [lindex $norm_segments end] [list type ITABLE value $object] - dict set lastd [lindex $norm_segments end] $object - #set lastd [dict merge $lastd $object] - lset existing_elements end $lastd - dict set datastructure {*}$superpath [list type ARRAY value $existing_elements] - } - } else { - #lappend ARRAY_ELEMENTS [list type ITABLE value $object] - lappend ARRAY_ELEMENTS $object - dict set datastructure {*}$norm_segments [list type ARRAY value $ARRAY_ELEMENTS] - } - } - TABLE { - set tablename [lindex $item 1] - set dottedtables_defined [list] ;#for closing off at end by setting 'defined' - #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. 
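A small sketch of the table-redefinition rule enforced by the check just below; the toml snippet is illustrative, and the errorcode is the one raised in this proc.

    # Hedged sketch: directly defining the same table header twice is rejected.
    set toml "\[a\]\nx = 1\n\[a\]\ny = 2\n"
    if {[catch {tomlish::to_dict [tomlish::decode::toml $toml]} errmsg erropts]} {
        puts [dict get $erropts -errorcode]  ;# expected: TOMLISH STRUCTURE TABLEREDEFINED
    }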
- - log::debug "---> to_dict processing item TABLE (name: $tablename): $item" - #set tablename [::tomlish::utils::tablename_trim $tablename] - set norm_segments [::tomlish::utils::tablename_split $tablename true] ;#true to normalize - - set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] - if {$T_DEFINED ne "NULL"} { - #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path - set msg "Table name $tablename has already been directly defined in the toml data. Invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg - } - - - set name_segments [::tomlish::utils::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d - #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. - - - set supertable [list] - ############## - # [a.b.c.d] - # norm_segments = {a b c d} - #check a {a b} {a b c} <---- supertables of a.b.c.d - ############## - foreach normseg [lrange $norm_segments 0 end-1] { - lappend supertable $normseg - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this path doesn't yet exist - if {[dict exists $datastructure {*}$supertable]} { - #There is data though - so it must have been created as a keyval - set msg "Supertable [join $supertable .] of table name $tablename already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dictn set tablenames_info [list $supertable type] header_table - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } else { - #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable - - } - } - #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename - # - or may have existing data from a keyval - if {![dictn exists $tablenames_info [list $norm_segments type]]} { - if {[dict exists $datastructure {*}$norm_segments]} { - #e.g from_toml {a=1} {[a]} - set msg "Cannot create table name $tablename. 
Key already has data but key doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablename_keyval_collision_error - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #no data or previously created table - dictn set tablenames_info [list $norm_segments type] header_table - - #We are 'defining' this table's keys and values here (even if empty) - dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here - } - dictn set tablenames_info [list $norm_segments defined] open - log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" - - #now add the contained elements - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> todict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - #convert to function: data needed? element, norm_segments refs to: tablename_info datastructure ??? - set dkey_info [_get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - - #[a.b] - #t1.t2.dottedtable.k = "val" - #we have already checked supertables a {a b} - #We need to check {a b t1} & {a b t2} ('creation' only) - #and then 'dottedtable' is 'defined' while k is an ordinary key in dottedtable - - #note we also get here as a 'dottedkey' with a simple - #[a.b] - #k = "val" - - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] - #obsolete - set nonleaf_keys [lrange $all_dotted_keys 0 end-1] ;#may be empty - - if {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] - } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - - #we've already tested the table keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key - set supertable $norm_segments - foreach normkey $dotparents { - lappend supertable $normkey - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this combined path (table norm_segments plus parts of dottedkey) not yet 'created' - if {[dict exists $datastructure {*}$supertable]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] 
of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dictn set tablenames_info [list $supertable type] unknown_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$norm_segments {*}$dotparents $dottedtable] ;#full path to dottedtable - #our dotted key is attempting to define a table - if {![dictn exists $tablenames_info [list $dottedpath type]]} { - #first one - but check datastructure for collisions - if {[dict exists $datastructure {*}$dottedpath]} { - set msg "Supertable [join $dottedpath .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dictn set tablenames_info [list $dottedpath type] dottedkey_table - #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dictn get $tablenames_info [list $dottedpath type]] eq "dottedkey_table"} { - #right type - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list $dottedpath defined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - } - } - } - #assert - dottedkey represents a key val pair that can be added - - - if {[dict exists $datastructure {*}$norm_segments {*}$all_dotted_keys]} { - set msg "Duplicate key '$norm_segments $all_dotted_keys'. The key already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - - set keyval_dict [_get_keyval_value $element] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> TABLE/dottedkey $dottedkeyname kv: $keyval_dict" - dict set datastructure {*}$norm_segments {*}$all_dotted_keys $keyval_dict - - #remove ? 
- if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - set tkey [list {*}$norm_segments {*}$all_dotted_keys] - #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] - - #by not creating a tablenames_info record - we effectively make it closed anyway? - #it should be detected as a key - #is there any need to store tablenames_info for it?? - #REVIEW - - ##TODO - update? - #dictn incr tablenames_info [list $tkey seencount] - ##if the keyval_dict is not a simple type x value y - then it's an inline table ? - ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - #dictn set tablenames_info [list $tkey closed] 1 - } - - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" - } - } - } - - #end of TABLE record - equivalent of EOF or next header - close off the dottedtables - foreach dtablepath $dottedtables_defined { - dictn set tablename_info [list $dtablepath defined] closed - } - - - #review??? - #now make sure we add an empty value if there were no contained elements! - #!todo. - } - ITABLE { - #SEP??? - set datastructure [list] - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [_get_dottedkey_info $element] - set dotted_key_hierarchy [dict get $dkey_info keys] - set leaf_key [lindex $dotted_key_hierarchy end] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - - #ensure empty keys are still represented in the datastructure - set table_keys [list] ;#We don't know the context - next level up will have to check for key collisions? - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in ITABLE context. 
Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - } - ARRAY { - #arrays in toml are allowed to contain mixtures of types - set datastructure [list] - log::debug "--> processing array: $item" - - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - STRING { - set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - ITABLE { - #anonymous table - #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - lappend datastructure [::tomlish::to_dict [list $element]] ;#store itables within arrays as raw dicts (possibly empty) - } - TABLE { - #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review - #doesn't make sense as table needs a name? - #take as synonym for ITABLE? - error "to_dict TABLE within array unexpected" - } - ARRAY - MULTISTRING - MULTILITERAL { - #set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - } - WS - SEP - NEWLINE - COMMENT { - #ignore whitespace, commas, newlines and comments - } - default { - error "Unexpected value type '$type' found in array" - } - } - } - } - MULTILITERAL { - #triple squoted string - #first newline stripped only if it is the very first element - #(ie *immediately* following the opening delims) - #All whitespace other than newlines is within LITERALPARTS - # ------------------------------------------------------------------------- - #todo - consider extension to toml to allow indent-aware multiline literals - # how - propose as issue in toml github? Use different delim? e.g ^^^ ? - #e.g - # xxx=?'''abc - # def - # etc - # ''' - # - we would like to trimleft each line to the column following the opening delim - # ------------------------------------------------------------------------- - - log::debug "---> todict processing multiliteral: $item" - set parts [lrange $item 1 end] - if {[lindex $parts 0 0] eq "NEWLINE"} { - set parts [lrange $parts 1 end] ;#skip it - } - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - switch -exact -- $type { - LITERALPART { - append stringvalue [lindex $element 1] - } - NEWLINE { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - MULTISTRING { - #triple dquoted string - log::debug "---> to_dict processing multistring: $item" - set stringvalue "" - set idx 0 - set parts [lrange $item 1 end] - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted - switch -exact -- $type { - STRING { - #todo - do away with STRING ? - #we don't build MULTISTRINGS containing STRING - but should we accept it? - tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" 
- append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" - } - STRINGPART { - append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] - } - CONT { - #When the last non-whitespace character on a line is an unescaped backslash, - #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter - # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - append stringvalue "\\" ;#add the sep - } else { - #skip over ws without emitting - set idx [llength $parts] - } - } else { - set parts_til_nl [lrange $parts 0 $next_nl-1] - set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] - if {$non_ws >= 0} { - append stringvalue "\\" - } else { - #skip over ws on this line - set idx $next_nl - #then have to check each subsequent line until we get to first non-whitespace - set trimming 1 - while {$trimming && $idx < [llength $parts]} { - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - } else { - set idx [llength $parts] - } - set trimming 0 - } else { - set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - set trimming 0 - } else { - set idx $next_nl - #keep trimming - } - } - } - } - } - } - NEWLINE { - #if newline is first element - it is not part of the data of a multistring - if {$idx > 0} { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - } - WS { - append stringvalue [lindex $element 1] - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - WS - COMMENT - NEWLINE { - #ignore - } - default { - error "Unexpected tag '$tag' in Tomlish list '$tomlish'" - } - } - } - return $datastructure - } - - - proc _from_dictval_tomltype {parents tablestack keys typeval} { - set type [dict get $typeval type] - set val [dict get $typeval value] - switch -- $type { - ARRAY { - set subitems [list] - foreach item $val { - lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP - } - if {[lindex $subitems end] eq "SEP"} { - set subitems [lrange $subitems 0 end-1] - } - return [list ARRAY {*}$subitems] - } - ITABLE { - if {$val eq ""} { - return ITABLE - } else { - return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] - } - } - MULTISTRING { - #value is a raw string that isn't encoded as tomlish - #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format - #We need to convert controls in $val to escape sequences - except for newlines - # - #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) - #we could use a line-length limit to decide when to put in a "line ending backslash" - #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) 
REVIEW - # - #TODO - set tomlpart "x=\"\"\"\\\n" - append tomlpart $val "\"\"\"" - set tomlish [tomlish::decode::toml $tomlpart] - #e.g if val = " etc\nblah" - #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } - #lindex 1 3 is the MULTISTRING tomlish list - return [lindex $tomlish 1 3] - } - MULTILITERAL { - #MLL string can contain newlines - but still no control chars - #todo - validate - set tomlpart "x='''\n" - append tomlpart $val ''' - set tomlish [tomlish::decode::toml $tomlpart] - return [lindex $tomlish 1 3] - } - LITERAL { - #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" - #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format - # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be - # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) - #we could choose to change the type to another format here when encountering invalid chars - but that seems - #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. - if {[string first ' $val] >=0} { - error "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" - } - #detect control chars other than tab - #for this we can use rawstring_to_Bstring_with_escaped_controls - even though this isn't a Bstring - #we are just using the map to detect a difference. - set testval [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val] - if {$testval ne $val} { - #some escaping would have to be done if this value was destined for a Bstring... - #therefor this string has controls and isn't suitable for a LITERAL according to the specs. - error "_from_dictval_tomltype error: control chars (other than tab) found in LITERAL value - cannot encode dict to TOML-VALID TOMLISH" - } - return [list LITERAL $val] - } - STRING { - return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] - } - INT { - if {![::tomlish::utils::is_int $val]} { - error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list INT $val] - } - FLOAT { - if {![::tomlish::utils::is_float $val]} { - error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list FLOAT $val] - } - default { - if {$type ni [::tomlish::tags]} { - error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" - } - return [list $type $val] - } - } - } - - #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string - proc _from_dict_classify_key {rawval} { - if {![::tomlish::utils::is_barekey $rawval]} { - #requires quoting - # - #Any dot in the key would have been split by to_dict - so if it's present here it's part of this key - not a level separator! - # - #we'll use a basic mechanisms for now to determine the type of quoting - # - whether it has any single quotes or not. - # (can't go in an SQKEY) - # - whether it has any chars that require quoting when in a Bstring - # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) - #todo - more? 
- #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY - # from literal examples: - # 'c:\Users\nodejs\templates' - # '<\i\c*\s*>' - #If these are in *keys* our basic test will express these as: - # "c:\\Users\\nodejs\\templates" - # "<\\i\\c*\\s*>" - # This still works - but a smarter test might determine when SQKEY is the better form? - #when coming from external systems - can we even know if the value was already escaped? REVIEW - #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped - #TODO - clarify in documentation that keys resulting from to_dict are in 'normalized' (unescaped) form - # - #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) - set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] - if {[string length $k_escaped] != [string length $rawval]} { - #escaping made a difference - set has_escape_requirement 1 - } else { - set has_escape_requirement 0 - } - if {[string first ' $rawval] >=0 || $has_escape_requirement} { - #basic string - # (any ANSI SGR sequence will end up here in escaped form ) - return [list DQKEY $k_escaped] - } else { - #literal string - return [list SQKEY $rawval] - } - } else { - return [list KEY $rawval] - } - } - - #the quoting implies the necessary escaping for DQKEYs - proc _from_dict_join_and_quote_raw_keys {rawkeylist} { - set result "" - foreach rk $rawkeylist { - lassign [_from_dict_classify_key $rk] type val - switch -- $type { - SQKEY { - append result "'$val'." - } - DQKEY { - append result "\"$val\"." - } - KEY { - append result "$val." - } - } - } - return [string range $result 0 end-1] - } - proc _from_dictval {parents tablestack keys vinfo} { - set k [lindex $keys end] - set K_PART [_from_dict_classify_key $k] ;#get [list SQKEY ] - puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" - puts stderr "---tablestack: $tablestack---" - set result [list] - set lastparent [lindex $parents end] - if {$lastparent in [list "" do_inline]} { - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - set type [dict get $vinfo type] - #treat ITABLE differently? - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} - } else { - if {$vinfo ne ""} { - - #set result [list DOTTEDKEY [list [list KEY $k]] = ] - #set records [list ITABLE] - - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - - if {$lastparent eq "do_inline"} { - set result [list DOTTEDKEY [list $K_PART] =] - set records [list ITABLE] - } else { - set tname [_from_dict_join_and_quote_raw_keys [list $k]] - set result [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $k]] - set records [list] - } - - - - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) - #if {[regexp {\s} $vk] || [string first . 
$vk] >= 0} { - # set VK_PART [list SQKEY $vk] - #} else { - # set VK_PART [list KEY $vk] - #} - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - #REVIEW - we could detect if value is an array of objects, - #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] - } else { - if {$vv eq ""} { - #experimental - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" - #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] - - #we can't just join normalized keys - need keys with appropriate quotes and escapes - #set tname [join [list {*}$keys $vk] .] ;#WRONG - set tq [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - set record [list TABLE $tq {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - } else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - set tablestack [list {*}$tablestack [list I $vk]] - } - } else { - if { 0 } { - #experiment.. sort of getting there. - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" - set tq [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - set record [list TABLE $tq {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - - #review - todo? - set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] - lappend record {*}$dottedkey_value - - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - } - if {$dictidx != $lastidx} { - #lappend record SEP - if {$lastparent eq "do_inline"} { - lappend record SEP - } else { - lappend record {NEWLINE lf} - } - } - lappend records $record - incr dictidx - } - if {$lastparent eq "do_inline"} { - lappend result $records {NEWLINE lf} - } else { - lappend result {*}$records {NEWLINE lf} - } - } else { - if {$lastparent eq "do_inline"} { - lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} - } else { - set tname [_from_dict_join_and_quote_raw_keys [list $k]] - lappend result TABLE $tname {NEWLINE lf} - } - } - } - } else { - #lastparent is not toplevel "" or "do_inline" - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result {*}$sublist - } else { - if {$lastparent eq "TABLE"} { - #review - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] - lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] - } - } else { - if {$vinfo ne ""} { - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - set sub [list] - #REVIEW - #set result $lastparent ;#e.g sets ITABLE - set result ITABLE - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - dict for {vk vv} $vinfo { - set VK_PART 
[_from_dict_classify_key $vk] ;#get [list SQKEY ] - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART] = $sublist] - } else { - if {$vv eq ""} { - #can't just uninline at this level - #we need a better method to query main dict for uninlinability at each level - # (including what's been inlined already) - #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - # puts stderr "_from_dictval uninline2 KEY $keys" - # set tname [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - # set record [list TABLE $tname {NEWLINE lf}] - # set tablestack [list {*}$tablestack [list T $vk]] - #} else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - #} - } else { - #set sub [_from_dictval ITABLE $vk $vv] - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - if {$dictidx != $lastidx} { - lappend record SEP - } - lappend result $record - incr dictidx - } - } else { - puts stderr "table x-1" - lappend result DOTTEDKEY [list $K_PART] = ITABLE - } - } - } - } - return $result - } - - - proc from_dict {d} { - #consider: - # t1={a=1,b=2} - # x = 1 - #If we represent t1 as an expanded table we get - # [t1] - # a=1 - # b=2 - # x=1 - # --- which is incorrect - as x was a toplevel key like t1! - #This issue doesn't occur if x is itself an inline table - # t1={a=1,b=2} - # x= {no="problem"} - # - # (or if we were to reorder x to come before t1) - - #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} - #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, - #which is unpreferred here. - - #A possible solution: - #scan the top level to see if all (trailing) elements are themselves dicts - # (ie not of form {type XXX value yyy}) - # - # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements - #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys - - #set root_has_values 0 - #approach 1) - the naive approach - forces inline when not always necessary - #dict for {k v} $d { - # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { - # set root_has_values 1 - # break - # } - #} - - - #approach 2) - track the position of last {type x value y} in the dictionary built by to_dict - # - still not perfect. Inlines dotted tables unnecessarily - #This means from_dict doesn't produce output optimal for human editing. - set last_simple [tomlish::dict::last_tomltype_posn $d] - - - ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values - #Any keys that are themselves tables - will need to be represented inline - #to avoid reordering, or incorrect assignment of plain values to the wrong table. - - ## set parent "" - #all toplevel keys in the dict structure can represent subtables. - #we are free to use {[tablename]\n} syntax for toplevel elements. 
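#A minimal standalone sketch of the ordering test described above (not the tomlish
#implementation itself - the package uses tomlish::dict::last_tomltype_posn). The helper
#name last_simple_posn is hypothetical: it returns the index of the last top-level entry
#of the form {type X value Y}; any table-like key appearing before that position must be
#emitted inline to avoid reordering or mis-assigning later plain keys to a subtable.
proc last_simple_posn {d} {
    set last -1
    set idx 0
    dict for {k v} $d {
        if {[llength $v] == 4 && [lindex $v 0] eq "type"} {
            set last $idx
        }
        incr idx
    }
    return $last
}
#e.g t1 is a subtable at position 0, x is a plain value at position 1:
#last_simple_posn returns 1, so t1 must be rendered inline (t1 = {a = 1}) rather than as a [t1] table.
puts [last_simple_posn {t1 {a {type INT value 1}} x {type INT value 1}}]  ;# -> 1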
- - - set tomlish [list TOMLISH] - set dictposn 0 - set tablestack [list [list T root]] ;#todo - dict for {t tinfo} $d { - if {$last_simple > $dictposn} { - set parents [list do_inline] - } else { - set parents [list ""] - } - set keys [list $t] - #review - where to make decision on - # DOTTEDKEY containing array of objs - #vs - # list of TABLEARRAY records - #At least for the top - set trecord [_from_dictval $parents $tablestack $keys $tinfo] - lappend tomlish $trecord - incr dictposn - } - return $tomlish - } - - proc json_to_toml {json} { - #*** !doctools - #[call [fun json_to_toml] [arg json]] - #[para] - - set tomlish [::tomlish::from_json $json] - set toml [::tomlish::to_toml $tomlish] - } - - #TODO use huddle? - proc from_json {json} { - #set jstruct [::tomlish::json_struct $json] - #return [::tomlish::from_json_struct $jstruct] - package require huddle - package require huddle::json - set h [huddle::json::json2huddle parse $json] - - } - - proc from_json_struct {jstruct} { - package require fish::json_toml - return [fish::json_toml::jsonstruct2tomlish $jstruct] - } - - proc toml_to_json {toml} { - set tomlish [::tomlish::from_toml $toml] - return [::tomlish::get_json $tomlish] - } - - proc get_json {tomlish} { - package require fish::json - set d [::tomlish::to_dict $tomlish] - - #return [::tomlish::dict_to_json $d] - return [fish::json::from "struct" $d] - } - - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -namespace eval tomlish::build { - #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness - # take a value of the appropriate type and wrap as a tomlish tagged item - proc STRING {s} { - return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] - } - proc LITERAL {litstring} { - error todo - } - - proc INT {i} { - #whole numbers, may be prefixed with a + or - - #Leading zeros are not allowed - #Hex,octal binary forms are allowed (toml 1.0) - #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) - #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. - # - We should probably raise an error for number larger than this and suggest the user supply it as a string? - if {[tcl::string::last , $i] > -1} { - error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" - } - if {![::tomlish::utils::int_validchars $i]} { - error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" - } - - if {[::tomlish::utils::is_int $i]} { - return [list INT $i] - } else { - error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" - } - - } - - proc FLOAT {f} { - #convert any non-lower case variants of special values to lowercase for Toml - if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { - return [list FLOAT [tcl::string::tolower $f]] - } - if {[::tomlish::utils::is_float $f]} { - return [list FLOAT $f] - } else { - error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" - } - } - - proc DATETIME {str} { - if {[::tomlish::utils::is_datetime $str]} { - return [list DATETIME $str] - } else { - error "Unable to interpret '$str' as Toml datetime. 
Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" - } - } - - proc BOOLEAN {b} { - #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false - if {![tcl::string::is boolean -strict $b]} { - error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" - } else { - if {$b && 1} { - return [::list BOOL true] - } else { - return [::list BOOL false] - } - } - } - - #REVIEW - #Take tablename followed by - # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} - # (accept also key value {STRING }) - # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types - proc _table {name args} { - set pairs [list] - foreach t $args { - if {[llength $t] == 4} { - if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { - error "Only items tagged as KEY = currently accepted as name-value pairs for table command" - } - lassign $t _k keystr _eq valuepart - if {[llength $valuepart] != 2} { - error "supplied value must be typed. e.g {INT 1} or {STRING test}" - } - lappend pairs [list KEY $keystr = $valuepart] - } elseif {[llength $t] == 2} { - #!todo - type heuristics - lassign $t n v - lappend pairs [list KEY $n = [list STRING $v]] - } else { - error "'KEY = { toml but - # the first newline is not part of the data. - # we elect instead to maintain a basic LITERALPART that must not contain newlines.. - # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, - #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. - set literal "" - foreach part [lrange $item 1 end] { - append literal [::tomlish::encode::tomlish [list $part] $nextcontext] - } - append toml '''$literal''' - } - INT - - BOOL - - FLOAT - - DATETIME { - append toml [lindex $item 1] - } - INCOMPLETE { - error "cannot process tomlish term tagged as INCOMPLETE" - } - COMMENT { - append toml "#[lindex $item 1]" - } - BOM { - #Byte Order Mark may appear at beginning of a file. Needs to be preserved. - append toml "\uFEFF" - } - default { - error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." - } - } - - } - return $toml - } - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] -} -#fish toml from tomlish - -#(encode tomlish as toml) -interp alias {} tomlish::to_toml {} tomlish::encode::tomlish - -# - - -namespace eval tomlish::decode { - #*** !doctools - #[subsection {Namespace tomlish::decode}] - #[para] - #[list_begin definitions] - - #return a Tcl list of tomlish tokens - #i.e get a standard list of all the toml terms in string $s - #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. - #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) - # ---------------------------------------------------------------------------------------------- - # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! - # e.g we deliberately don't check certain things such as duplicate table declarations here. 
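#Usage sketch of the lossless round-trip described above, via the to_toml/from_toml
#aliases defined in this file (assumes the tomlish package itself is already loaded).
#Because whitespace, comments, quoting style and line-endings are all preserved in the
#tomlish list, re-encoding is expected to reproduce the input exactly.
set toml_src {# comment is retained
[server]
host = "localhost"  # so is surrounding whitespace and this trailing comment
}
set tomlish_list [tomlish::from_toml $toml_src]
set re_encoded   [tomlish::to_toml $tomlish_list]
puts [expr {$re_encoded eq $toml_src ? "roundtrip identical" : "roundtrip differs"}]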
- # ---------------------------------------------------------------------------------------------- - #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. - # (e.g perhaps a toml editor to highlight violations for fixing) - # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. - # e.g dicts or an object oriented structure - #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage - #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc - #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. - # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) - #If we were to unescape a tab character for example - # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. - # For this reason, we also do absolutely no line-ending transformations based on platform. - # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' - - proc toml {args} { - #*** !doctools - #[call [fun toml] [arg arg...]] - #[para] return a Tcl list of tomlish tokens - - set s [join $args \n] - - namespace upvar ::tomlish::parse is_parsing is_parsing - set is_parsing 1 - - if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { - tomlish::parse::spacestack destroy - } - struct::stack ::tomlish::parse::spacestack - - namespace upvar ::tomlish::parse last_space_action last_space_action - namespace upvar ::tomlish::parse last_space_type last_space_type - - namespace upvar ::tomlish::parse tok tok - set tok "" - - namespace upvar ::tomlish::parse type type - namespace upvar ::tomlish::parse tokenType tokenType - ::tomlish::parse::set_tokenType "" - namespace upvar ::tomlish::parse tokenType_list tokenType_list - set tokenType [list] ;#Flat (un-nested) list of tokentypes found - - namespace upvar ::tomlish::parse lastChar lastChar - set lastChar "" - - - set result "" - namespace upvar ::tomlish::parse nest nest - set nest 0 - - namespace upvar ::tomlish::parse v v ;#array keyed on nest level - - - set v(0) {TOMLISH} - array set s0 [list] ;#whitespace data to go in {SPACE {}} element. - set parentlevel 0 - - namespace upvar ::tomlish::parse i i - set i 0 - - namespace upvar ::tomlish::parse state state - - namespace upvar ::tomlish::parse braceCount braceCount - set barceCount 0 - namespace upvar ::tomlish::parse bracketCount bracketCount - set bracketCount 0 - - set sep 0 - set r 1 - namespace upvar ::tomlish::parse token_waiting token_waiting - set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. - - - set state "table-space" - ::tomlish::parse::spacestack push {type space state table-space} - namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. 
(incremented for each literal linefeed - but not escaped ones in data) - set linenum 1 - - set ::tomlish::parse::state_list [list] - try { - while {$r} { - set r [::tomlish::parse::tok $s] - #puts stdout "got tok: '$tok' while parsing string '$s' " - set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' - - - #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" - #puts "-->tok: $tok tokenType='$tokenType'" - set prevstate $state - set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] - #review goNextState could perform more than one space_action - set space_action [dict get $transition_info space_action] - set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below - - if {[tcl::string::match "err-*" $state]} { - ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" - lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] - return $v(0) - } - # --------------------------------------------------------- - #NOTE there may already be a token_waiting at this point - #set_token_waiting can raise an error here, - # in which case the space_action branch needs to be rewritten to handle the existing token_waiting - # --------------------------------------------------------- - - if {$space_action eq "pop"} { - #pop_trigger_tokens: newline tablename endarray endinlinetable - #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. - set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append - switch -exact -- $tokenType { - tentative_accum_squote { - #should only apply within a multiliteral - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the xxx-squote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - #assert prevstate always trailing-squote-space - #dev guardrail - remove? assertion lib? - switch -exact -- $prevstate { - trailing-squote-space { - } - default { - error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" - } - } - switch -- $tok { - ' { - tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] - } - '' { - #review - we should perhaps return double_squote instead? 
- #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] - } - ''' { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] - } - '''' { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } - } - ''''' { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } - } - } - } - triple_squote { - #presumably popping multiliteral-space - ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTILITERAL { - lappend merged $part - } - LITERALPART { - if {$lasttype eq "LITERALPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. - lappend merged $part - } - default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - tentative_accum_dquote { - #should only apply within a multistring - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the trailing-dquote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - #assert prevstate always trailing-dquote-space - #dev guardrail - remove? assertion lib? 
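#Illustration of the trailing-quote folding handled below when the accumulated quote run
#is examined (mirroring the single-quote cases above): in    a = """x""""    the closing
#delimiter is the final three dquotes and the value ends with a literal dquote, so when
#four dquotes have been accumulated, one is folded back into the preceding STRINGPART of
#the parent space and a triple_dquote token is re-emitted to terminate the multistring.
#With five accumulated dquotes, two are folded back in the same way.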
- switch -exact -- $prevstate { - trailing-dquote-space { - } - default { - error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" - } - } - switch -- $tok { - {"} { - tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] - } - {""} { - #review - we should perhaps return double_dquote instead? - #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] - } - {"""} { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] - } - {""""} { - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left dquote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - STRINGPART { - set newval "[lindex $lastpart 1]\"" - set parentdata $v($parentlevel) - lset parentdata end [list STRINGPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE - CONT - WS { - lappend v($parentlevel) [list STRINGPART {"}] - } - MULTISTRING { - #empty - lappend v($parentlevel) [list STRINGPART {"}] - } - default { - error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" - } - } - } - {"""""} { - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 dquotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - STRINGPART { - set newval "[lindex $lastpart 1]\"\"" - set parentdata $v($parentlevel) - lset parentdata end [list STRINGPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE - CONT - WS { - lappend v($parentlevel) [list STRINGPART {""}] - } - MULTISTRING { - lappend v($parentlevel) [list STRINGPART {""}] - } - default { - error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" - } - } - } - } - } - triple_dquote { - #presumably popping multistring-space - ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTISTRING { - lappend merged $part - } - STRINGPART { - if {$lasttype eq "STRINGPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - CONT - WS { - lappend merged $part - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. - lappend merged $part - } - default { - error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - equal { - #pop caused by = - switch -exact -- $prevstate { - dottedkey-space { - tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - dottedkey-space-tail { - #experiment? 
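#As in the dottedkey-space case above, the '=' that terminated this space is re-emitted
#via set_token_waiting so that the parent space owning the keyval still receives an equal
#token - e.g for  a.b.c = 1  the DOTTEDKEY level pops here and the parent then consumes
#the '='.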
- tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - } - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - tablename { - #note: a tablename only 'pops' if we are greater than zero - error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" - } - tablearrayname { - #!review - tablearrayname different to tablename regarding push/pop? - #note: a tablename only 'pops' if we are greater than zero - error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" - } - endarray { - #nothing to do here. - } - comma { - #comma for inline table will pop the keyvalue space - lappend v($nest) "SEP" - } - endinlinetable { - ::tomlish::log::debug "---- endinlinetable for last_space_action pop" - } - default { - error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" - } - } - if {$do_append_to_parent} { - #e.g tentative_accum_squote does it's own appends as necessary - so won't get here - lappend v($parentlevel) [set v($nest)] - } - - incr nest -1 - - } elseif {$last_space_action eq "push"} { - set prevnest $nest - incr nest 1 - set v($nest) [list] - # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname - - - switch -exact -- $tokenType { - tentative_trigger_squote - tentative_trigger_dquote { - #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote - if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { - lassign [dict get $transition_info starttok] starttok_type starttok_val - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType $starttok_type - set tok $starttok_val - } - } - single_squote { - #JMN - REVIEW - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART - } - squotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - triple_dquote { - set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT - } - dquotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - barekey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? 
- set waiting [tomlish::parse::get_token_waiting] - if {[llength $waiting]} { - set i [dict get $waiting startindex] - tomlish::parse::clear_token_waiting - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } else { - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - } - tablename { - #note: we do not use the output of tomlish::tablename_trim to produce a tablename for storage in the tomlish list! - #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish - # back to toml file will be identical. - #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. - # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from - # a structural perspective. - - #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, - # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the - # tomlish list? - - #set trimtable [::tomlish::utils::tablename_trim $tok] - #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" - set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name - #note also that equivalent tablenames may have different toml representations even after being trimmed! - #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) - #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. - } - tablearrayname { - #set trimtable [::tomlish::utils::tablename_trim $tok] - #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" - set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name - } - startarray { - set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. - } - startinlinetable { - set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. - } - default { - error "---- push trigger tokenType '$tokenType' not yet implemented" - } - } - - } else { - #no space level change - switch -exact -- $tokenType { - squotedkey { - #puts "---- squotedkey in state $prevstate (no space level change)" - lappend v($nest) [list SQKEY $tok] - } - dquotedkey { - #puts "---- dquotedkey in state $prevstate (no space level change)" - lappend v($nest) [list DQKEY $tok] - } - barekey { - lappend v($nest) [list KEY $tok] - } - dotsep { - lappend v($nest) [list DOTSEP] - } - starttablename { - #$tok is triggered by the opening bracket and sends nothing to output - } - starttablearrayname { - #$tok is triggered by the double opening brackets and sends nothing to output - } - tablename - tablenamearray { - error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" - #set v($nest) [list TABLE $tok] - } - endtablename - endtablearrayname { - #no output into the tomlish list for this token - } - startinlinetable { - puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" - } - single_dquote { - switch -exact -- $newstate { - string-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "string" - set tok "" - } - dquoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "dquotedkey" - set tok "" - } - multistring-space { - lappend v($nest) [list STRINGPART {"}] - #may need to be joined on pop if there are neighbouring STRINGPARTS - } - default { - error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - double_dquote { - #leading extra quotes - test: toml_multistring_startquote2 - switch -exact -- $prevstate { - itable-keyval-value-expected - keyval-value-expected { - puts stderr "tomlish::decode::toml double_dquote TEST" - #empty string - lappend v($nest) [list STRINGPART ""] - } - multistring-space { - #multistring-space to multistring-space - lappend v($nest) [list STRINGPART {""}] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - - } - single_squote { - switch -exact -- $newstate { - literal-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "literal" - set tok "" - } - squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - multiliteral-space { - #false alarm squote returned from tentative_accum_squote pop - ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" - #(single squote - not terminating space) - lappend v($nest) [list LITERALPART '] - #may need to be joined on pop if there are neighbouring LITERALPARTs - } - default { - error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - double_squote { - switch -exact -- $prevstate { - keyval-value-expected { - lappend v($nest) [list LITERAL ""] - } - multiliteral-space { - #multiliteral-space to multiliteral-space - lappend v($nest) [list LITERALPART ''] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - enddquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } - multistring { - #review - lappend v($nest) [list MULTISTRING $tok] - } - stringpart { - lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly - } - multiliteral { - lappend v($nest) [LIST MULTILITERAL $tok] - } - literalpart { - lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - untyped_value { - #would be better termed unclassified_value - #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. - unset -nocomplain tag - if {$tok in {true false}} { - set tag BOOL - } else { - if {[::tomlish::utils::is_int $tok]} { - set tag INT - } else { - if {[string is integer -strict $tok]} { - #didn't qualify as a toml int - but still an int - #probably means is_int is limiting size and not accepting bigints (configurable?) 
- #or it didn't qualify due to more than 1 leading zero - #or other integer format issue such as repeated underscores - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" - } else { - if {[::tomlish::utils::is_float $tok]} { - set tag FLOAT - } elseif {[::tomlish::utils::is_datetime $tok]} { - set tag DATETIME - } else { - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" - } - } - } - } - #assert either tag is set, or we errored out. - lappend v($nest) [list $tag $tok] - - } - comment { - #puts stdout "----- comment token returned '$tok'------" - lappend v($nest) [list COMMENT "$tok"] - } - equal { - #we append '=' to the nest so that any surrounding whitespace is retained. - lappend v($nest) = - } - comma { - lappend v($nest) SEP - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - whitespace { - lappend v($nest) [list WS $tok] - } - continuation { - lappend v($nest) CONT - } - bom { - lappend v($nest) BOM - } - eof { - #ok - nothing more to add to the tomlish list. - #!todo - check previous tokens are complete/valid? - } - default { - error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - - if {!$next_tokenType_known} { - ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" - ::tomlish::parse::set_tokenType "" - set tok "" - } - - if {$state eq "end-state"} { - break - } - - - } - - #while {$nest > 0} { - # lappend v([expr {$nest -1}]) [set v($nest)] - # incr nest -1 - #} - while {[::tomlish::parse::spacestack size] > 1} { - ::tomlish::parse::spacestack pop - lappend v([expr {$nest -1}]) [set v($nest)] - incr nest -1 - - #set parent [spacestack peek] ;#the level being appended to - #lassign $parent type state - #if {$type eq "space"} { - # - #} elseif {$type eq "buffer"} { - # lappend v([expr {$nest -1}]) {*}[set v($nest)] - #} else { - # error "invalid spacestack item: $parent" - #} - } - - } finally { - set is_parsing 0 - } - return $v(0) - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] -} -#decode toml to tomlish -interp alias {} tomlish::from_toml {} tomlish::decode::toml - -namespace eval tomlish::utils { - #*** !doctools - #[subsection {Namespace tomlish::utils}] - #[para] - #[list_begin definitions] - - - #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace - # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] - #trimmed, the tablename becomes {a.b.c} - # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] - #ie whitespace is only irrelevant if it's outside a quoted segment - #trimmed, the tablename becomes {a.b."c etc "} - proc tablename_trim {tablename} { - set segments [tablename_split $tablename false] - set trimmed_segments [list] - foreach seg $segments { - lappend trimmed_segments [::string trim $seg " \t"] - } - return [join $trimmed_segments .] 
- } - - #basic generic quote matching for single and double quotes - #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes - proc tok_in_quotedpart {tok} { - set sLen [tcl::string::length $tok] - set quote_type "" - set had_slash 0 - for {set i 0} {$i < $sLen} {incr i} { - set c [tcl::string::index $tok $i] - if {$quote_type eq ""} { - if {$had_slash} { - #don't enter quote mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - set quote_type dq - } - sq { - set quote_type sq - } - bsl { - set had_slash 1 - } - } - } - } else { - if {$had_slash} { - #don't leave quoted mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - if {$quote_type eq "dq"} { - set quote_type "" - } - } - sq { - if {$quote_type eq "sq"} { - set quote_type "" - } - } - bsl { - set had_slash 1 - } - } - } - } - } - return $quote_type ;#dq | sq - } - - #utils::tablename_split - proc tablename_split {tablename {normalize false}} { - #we can't just split on . because we have to handle quoted segments which may contain a dot. - #eg {dog."tater.man"} - set sLen [tcl::string::length $tablename] - set segments [list] - set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval - #quoted is for double-quotes, litquoted is for single-quotes (string literal) - set seg "" - for {set i 0} {$i < $sLen} {incr i} { - - if {$i > 0} { - set lastChar [tcl::string::index $tablename [expr {$i - 1}]] - } else { - set lastChar "" - } - - #todo - track\count backslashes properly - - set c [tcl::string::index $tablename $i] - if {$c eq "\""} { - if {($lastChar eq "\\")} { - #not strictly correct - we could have had an even number prior-backslash sequence - #the toml spec would have us error out immediately on bsl in bad location - but we're - #trying to parse to unvalidated tomlish - set ctest escq - } else { - set ctest dq - } - } else { - set ctest [string map [list " " sp \t tab] $c] - } - - switch -- $ctest { - . { - switch -exact -- $mode { - preval { - error "tablename_split. dot not allowed - expecting a value" - } - unquoted { - #dot marks end of segment. - #if {![is_barekey $seg]} { - # error "tablename_split. dot not allowed - expecting a value" - #} - lappend segments $seg - set seg "" - set mode "preval" - } - quoted { - append seg $c - } - litquoted { - append seg $c - } - postval { - #got dot in an expected location - set mode "preval" - } - } - } - dq { - #unescaped dquote - switch -- $mode { - preval { - set mode "quoted" - set seg "\"" - } - unquoted { - #invalid in barekey - but we are after structure only - append seg $c - } - quoted { - append seg $c - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg - } - set seg "" - set mode "postval" ;#make sure we only accept a dot or end-of-data now. - } - litquoted { - append seg $c - } - postval { - error "tablename_split. expected whitespace or dot, got double quote. tablename: '$tablename'" - } - } - } - ' { - switch -- $mode { - preval { - append seg $c - set mode "litquoted" - } - unquoted { - #single quote inside e.g o'neill - ultimately invalid - but we pass through here. 
- append seg $c - } - quoted { - append seg $c - } - litquoted { - append seg $c - #no normalization to do aside from stripping squotes - if {$normalize} { - lappend segments [tcl::string::range $seg 1 end-1] - } else { - lappend segments $seg - } - set seg "" - set mode "postval" - } - postval { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" - } - } - } - sp - tab { - switch -- $mode { - preval - postval { - #ignore - } - unquoted { - #terminates a barekey - lappend segments $seg - set seg "" - set mode "postval" - } - default { - #append to quoted or litquoted - append seg $c - } - } - } - default { - switch -- $mode { - preval { - set mode unquoted - append seg $c - } - postval { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" - } - default { - append seg $c - } - } - } - } - - if {$i == $sLen-1} { - #end of data - ::tomlish::log::debug "End of data: mode='$mode'" - switch -exact -- $mode { - preval { - error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" - } - unquoted { - lappend segments $seg - } - quoted { - error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" - } - litquoted { - error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" - } - postval { - #ok - segment already lappended - } - } - } - } - - #note - we must allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - - return $segments - } - - proc unicode_escape_info {slashu} { - #!todo - # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and - # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) - # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive - #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} - if {[tcl::string::match {\\u*} $slashu]} { - set exp {^\\u([0-9a-fA-F]{4}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %4x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } - } else { - return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] - } - } elseif {[tcl::string::match {\\U*} $slashu]} { - set exp {^\\U([0-9a-fA-F]{8}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %8x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } else { - return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] - } - } - } else { - return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] - } - } else { - return [list err [list reason "Supplied string did not start with \\u or \\U" ]] - } - - } - - #Note that unicode characters don't *have* to be escaped. - #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. 
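#Usage sketch for the tablename helpers above (assumes the tomlish package is loaded).
#tablename_trim only strips the irrelevant whitespace around dotted segments, keeping
#quoted segments (and their quotes) intact; tablename_split with normalize=true also
#strips the segment quotes and applies escapes within double-quoted segments.
puts [tomlish::utils::tablename_trim { a.b . "c etc " }]        ;# expected: a.b."c etc "
puts [tomlish::utils::tablename_split {dog."tater.man"} true]   ;# expected: two segments, dog and tater.man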
- #- an inverse of unescape_string would encode all unicode chars unnecessarily. - #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc - #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. - #REVIEW - provide it anyway? When would it be desirable to use? - - variable Bstring_control_map [dict create] - dict set Bstring_control_map \b {\b} - dict set Bstring_control_map \n {\n} - dict set Bstring_control_map \r {\r} - dict set Bstring_control_map \" {\"} - dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. - dict set Bstring_control_map \\ "\\\\" - - #\e for \x1b seems like it might be included - v1.1?? hard to find current state of where toml is going :/ - #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - #8 = \b - already in list. - #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list - for {set cdec 0} {$cdec <= 7} {incr cdec} { - set hhhh [format %.4X $cdec] - set char [format %c $cdec] - if {![dict exists $Bstring_control_map $char]} { - dict set Bstring_control_map $char \\u$hhhh - } - } - for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { - set hhhh [format %.4X $cdec] - set char [format %c $cdec] - if {![dict exists $Bstring_control_map $char]} { - dict set Bstring_control_map $char \\u$hhhh - } - } - # \u007F = 127 - dict set Bstring_control_map [format %c 127] \\u007F - - #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! - #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) - #for example - can be used by from_dict to produce valid Bstring data for a tomlish record - proc rawstring_to_Bstring_with_escaped_controls {str} { - #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. - #we'll use a string map with an explicit list rather than algorithmic at runtime - # - the string map is probably more performant than splitting a string, especially if it's large - variable Bstring_control_map - return [string map $Bstring_control_map $str] - } - - #review - unescape what string? Bstring vs MLBstring? - #we should be specific in the function naming here - #used by to_dict - so part of validation? - REVIEW - proc unescape_string {str} { - #note we can't just use Tcl subst because: - # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. - # it would strip out backslashes inappropriately: e.g "\j" becomes just j - # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn - # it replaces \ with a single whitespace (trailing backslash) - #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh - #plus \e for \x1b? - - set buffer "" - set buffer4 "" ;#buffer for 4 hex characters following a \u - set buffer8 "" ;#buffer for 8 hex characters following a \u - - set sLen [tcl::string::length $str] - - #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc - set slash_active 0 - set unicode4_active 0 - set unicode8_active 0 - - ::tomlish::log::debug "unescape_string. 
got len [string length str] str $str" - - #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? - set i 0 - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $str [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $str $i] - #::tomlish::log::debug "unescape_string. got char $c" ;#too much? - - #---------------------- - #as we are 'unescaping' - should we really be testing for existing values that should have been escaped? - #this test looks incomplete anyway REVIEW - scan $c %c n - if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { - #we don't expect unescaped unicode characters from 0000 to 001F - - #*except* for raw tab (which is whitespace) and newlines - error "unescape_string. Invalid data for a toml string. Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" - } - #---------------------- - - incr i ;#must incr here because we do'returns'inside the loop - if {$c eq "\\"} { - if {$slash_active} { - append buffer "\\" - set slash_active 0 - } elseif {$unicode4_active} { - error "unescape_string. unexpected case slash during unicode4 not yet handled" - } elseif {$unicode8_active} { - error "unescape_string. unexpected case slash during unicode8 not yet handled" - } else { - # don't output anything (yet) - set slash_active 1 - } - } else { - if {$unicode4_active} { - if {[tcl::string::length $buffer4] < 4} { - append buffer4 $c - } - if {[tcl::string::length $buffer4] == 4} { - #we have a \uHHHH to test - set unicode4_active 0 - set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$unicode8_active} { - if {[tcl::string::length $buffer8] < 8} { - append buffer8 $c - } - if {[tcl::string::length $buffer8] == 8} { - #we have a \UHHHHHHHH to test - set unicode8_active 0 - set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$slash_active} { - set slash_active 0 - set ctest [tcl::string::map {{"} dq} $c] - switch -exact -- $ctest { - dq { - append buffer {"} - } - b - t - n - f - r { - append buffer [subst -nocommand -novariable "\\$c"] - } - e { - append buffer \x1b - } - u { - set unicode4_active 1 - set buffer4 "" - } - U { - set unicode8_active 1 - set buffer8 "" - } - default { - set slash_active 0 - #review - toml spec says all other escapes are reserved - #and if they are used TOML should produce an error. - #we leave detecting this for caller for now - REVIEW - append buffer "\\$c" - } - } - } else { - append buffer $c - } - } - } - #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" - if {$unicode4_active} { - error "End of string reached before complete unicode escape sequence \uHHHH" - } - if {$unicode8_active} { - error "End of string reached before complete unicode escape sequence \UHHHHHHHH" - } - if {$slash_active} { - append buffer "\\" - } - return $buffer - } - - #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) 
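#Usage sketch for the escape helpers above (assumes the tomlish package is loaded).
#rawstring_to_Bstring_with_escaped_controls escapes only what a basic string *must*
#escape (controls, backslash, double quote) - the literal tab is deliberately left
#alone - and unescape_string reverses the toml escape sequences, so the pair is
#expected to round-trip such a value.
set raw "line1\nline2\ttab and \"quotes\""
set escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $raw]
puts $escaped                                                       ;# newline and quotes now escaped, tab left literal
puts [expr {[tomlish::utils::unescape_string $escaped] eq $raw}]    ;# expected: 1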
- #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, - #e.g squoted vs dquoted vs barekey. - proc normalize_key {rawkey} { - set c1 [tcl::string::index $rawkey 0] - set c2 [tcl::string::index $rawkey end] - if {($c1 eq "'") && ($c2 eq "'")} { - #single quoted segment. No escapes allowed within it. - set key [tcl::string::range $rawkey 1 end-1] - } elseif {($c1 eq "\"") && ($c2 eq "\"")} { - #double quoted segment. Apply escapes. - # - set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only - #e.g key could have mix of \UXXXXXXXX escapes and unicode chars - #or mix of \t and literal tabs. - #unescape to convert all to literal versions for comparison - set key [::tomlish::utils::unescape_string $keydata] - #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. - } else { - set key $rawkey - } - return $key - } - - proc string_to_slashu {string} { - set rv {} - foreach c [split $string {}] { - scan $c %c cdec - if {$cdec > 65535} { - append rv {\U} [format %.8X $cdec] - } else { - append rv {\u} [format %.4X $cdec] - } - } - return $rv - } - - #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. - #This is used for display purposes only (error msgs) - proc nonprintable_to_slashu {s} { - set res "" - foreach i [split $s ""] { - scan $i %c cdec - - set printable 0 - if {($cdec>31) && ($cdec<127)} { - set printable 1 - } - if {$printable} { - append res $i - } else { - if {$cdec > 65535} { - append res \\U[format %.8X $cdec] - } else { - append res \\u[format %.4X $cdec] - } - } - } - set res - } ;# initial version from tcl wiki RS - - #check if str is valid for use as a toml bare key - #Early toml versions? 
only allowed letters + underscore + dash - proc is_barekey1 {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } else { - set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters match the regexp - return 1 - } else { - return 0 - } - } - } - - #from toml.abnf in github.com/toml-lang/toml - #unquoted-key = 1*unquoted-key-char - #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ - #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions - #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block - #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon - #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics - #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators - #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols - #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation - #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank - #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space - #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - variable re_barekey - set ranges [list] - lappend ranges {a-zA-Z0-9\_\-} - lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions - lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block - lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon - lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics - lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators - lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols - lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation - lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank - lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space - lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - set re_barekey {^[} - foreach r $ranges { - append re_barekey $r - } - append re_barekey {]+$} - - proc is_barekey {str} { - if {[tcl::string::length 
$str] == 0} { - return 0 - } - variable re_barekey - return [regexp $re_barekey $str] - } - - #test only that the characters in str are valid for the toml specified type 'integer'. - proc int_validchars1 {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - #add support for hex,octal,binary 0x.. 0o.. 0b... - proc int_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_int {str} { - set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - - # --------------------------------------- - #check for leading zeroes in non 0x 0b 0o - #first strip any +, - or _ (just for this test) - #(but still allowing 0 -0 +0) - set check [tcl::string::map {+ "" - "" _ ""} $str] - if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { - return 0 - } - # --------------------------------------- - - #check +,- only occur in the first position. (excludes also +++1 etc) - if {[tcl::string::last - $str] > 0} { - return 0 - } - if {[tcl::string::last + $str] > 0} { - return 0 - } - - #------------------------------------------- - #unclear if a 'digit' includes the type specifiers x b o - #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem - #to be likely to cause interop issues with other systems - #(e.g tcl allows 0b1_1 but not 0b_11) - #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) - #we still need to support earlier Tcl for now though. 
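#Illustration of the underscore handling below (examples only, derived from the checks
#that follow): an underscore is only accepted as a solo separator inside the digit part.
#  1_000        -> accepted
#  1__000       -> rejected by the double-underscore check
#  _1000, 1000_ -> rejected (leading/trailing underscore in the digit part)
#  0x_FF, 0b1_  -> rejected (underscore touching the prefix or the end of the digits)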
- - #first rule out any case with more than one underscore in a row - if {[regexp {__} $str]} { - return 0 - } - if {[string index $str 0] eq "_"} { - return 0 - } - set utest [string trimleft $str +-] - #test again for further trick like _+_0xFF - if {[string index $utest 0] eq "_"} { - return 0 - } - if {[string range $utest 0 1] in {0x 0b 0o}} { - set testnum [string range $utest 2 end] - } else { - set testnum $utest - #exclude also things like 0_x 0___b that snuck past our prefix test - if {![string is digit -strict [string map {_ ""} $testnum]]} { - return 0 - } - #assert - only digits and underscores in testnum - #still may have underscores at each end - } - #assert testnum is now the 'digits' portion of a , 0x 0b 0o number - #(+ and - already stripped) - #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below - if {[string length $testnum] != [string length [string trim $testnum _]]} { - #had non-inner underscores in 'digit' part - return 0 - } - #assert str only has solo inner underscores (if any) between 'digits' - #------------------------------------------- - - set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores - #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) - if {![tcl::string::is integer -strict $numeric_value]} { - return 0 - } - - - - #!todo - check bounds only based on some config value - #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) - #presumably very large numbers would have to be supplied in a toml file as strings. - #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max - #some question around implementations allowed to use lower values such as 2^31 on some systems? - if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { - return 0 - } - if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { - return 0 - } - } else { - return 0 - } - #Got this far - didn't find anything wrong with it. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'float'. - proc float_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { - return 1 - } else { - #only allow lower case for these special values - as per Toml 1.0 spec - if {$str ni {inf +inf -inf nan +nan -nan}} { - return 0 - } else { - return 1 - } - } - } - - #note - Tcl's string is double will return true also for the subset of float values which are integers - #This function is to determine whether it matches the Toml float concept - so requires a . or e or E - proc is_float {str} { - #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) - if {$str in {inf +inf -inf nan +nan -nan}} { - return 1 - } - #doorcheck the basics for floatiness vs members of that rival gang - ints - if {![regexp {[.eE]} $str]} { - #could be an integer - which isn't specifically a float for Toml purposes. 
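#For contrast with is_int, a small standalone sketch (again assuming the tomlish package is
#loaded) of how the two validators partition values: a toml float needs a '.', an exponent,
#or one of the lowercase special literals, so plain and prefixed integers fall through to is_int.
foreach candidate {42 0x2A 1.5 3e4 5e+22 1.0e6 +inf nan} {
    puts [format "%-8s is_int:%d is_float:%d" $candidate \
        [tomlish::utils::is_int $candidate] [tomlish::utils::is_float $candidate]]
}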
- return 0 - } - - - #patdown for any contraband chars - set matches [regexp -all {[eE0-9\_\-\+\.]} $str] - if {[tcl::string::length $str] != $matches} { - return 0 - } - - #all characters in legal range - - #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) - - #Early Toml spec also disallowed leading zeros in the exponent part(?) - #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) - #we allow leading zeros in exponents here. - - #Check for leading zeros in main part - #first strip any +, - or _ (just for this test) - set check [tcl::string::map {+ "" - "" _ ""} $str] - set r {([0-9])*} - regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E - set z {([0])*} - regexp $z $intpart leadingzeros - if {[tcl::string::length $leadingzeros] > 1} { - return 0 - } - - #for floats, +,- may occur in multiple places - #e.g -2E-22 +3e34 - #!todo - check bounds ? - - #----------------------------------------- - if {[regexp {__} $str]} { - return 0 - } - if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { - return 0 - } - set utest [string trimleft $str +-] - #test again for further trick like _+_ - if {[string index $utest 0] eq "_"} { - return 0 - } - #----------------------------------------- - - #decimal point, if used must be surrounded by at least one digit on each side - #e.g 3.e+20 also illegal - set dposn [string first . $str] - if {$dposn > -1 } { - set d3 [string range $str $dposn-1 $dposn+1] - if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { - return 0 - } - } - #we've already eliminated leading/trailing underscores - #now ensure each inner underscore is surrounded by digits - if {[regexp {_[^0-9]|[^0-9]_} $str]} { - return 0 - } - - #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores - set check [tcl::string::map {_ ""} $str] - #string is double accepts inf nan +NaN etc. - if {![tcl::string::is double $check]} { - return 0 - } - - #All good - seems to be a toml-approved float and not an int. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'datetime'. 
- proc datetime_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_datepart {str} { - set matches [regexp -all {[0-9\-]} $str] - if {[tcl::string::length $str] != $matches} { - return 0 - } - #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) - if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { - return 0 - } - - return 1 - } - proc is_localdate {str} { - is_datepart $str - } - proc is_timepart {str} { - set numchars [tcl::string::length $str] - #timepart can have negative or positive offsets so - and + must be accepted - if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { - #todo - return 1 - } else { - return 0 - } - } - proc is_localtime {str} { - #time of day without any relation to a specific day or any offset or timezone - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\.:]} $str] == $numchars} { - #todo - return 1 - } else { - return 0 - } - } - - #review - proc is_datetime {str} { - #Essentially RFC3339 formatted date-time - but: - #1) allowing seconds to be omitted (:00 assumed) - #2) T may be replaced with a single space character TODO - parser support for space in datetime! - # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) - - #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does - #toml spec doesn't clarify - we will accept - - #e.g 1979-05-27 - #e.g 1979-05-27T00:32:00Z - #e.g 1979-05-27 00:32:00-07:00 - #e.g 1979-05-27 00:32:00+10:00 - #e.g 1979-05-27 00:32:00.999999-07:00 - - #review - #minimal datetimes? - # 2024 not ok - 2024T not accepted by tomlint why? - # 02:00 ok - # 02:00:00.5 ok - # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec - - #toml-lint.com accepts 2025-01 - - if {[string length $str] < 5} { - return 0 - } - - set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - if {[regexp -all {\ } $str] > 1} { - #only a single space is allowed. - return 0 - } - #If we get a space - it is only valid as a convience to represent the T separator - #we can normalize by converting to T here before more tests - set str [string map {" " T t T} $str] - #a further sanity check on T - if {[regexp -all {T} $str] > 1} { - return 0 - } - - #!todo - use full RFC 3339 parser? - #!todo - what if the value is 'time only'? - - if {[string first T $str] > -1} { - lassign [split $str T] datepart timepart - if {![is_datepart $datepart]} { - return 0 - } - if {![is_timepart $timepart]} { - return 0 - } - } else { - #either a datepart or a localtime - #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day - # without any relation to a specific day or any offset or timezone." - if {!([is_datepart $str] || [is_localtime $str])} { - return 0 - } - } - - - #Tcl's free-form clock scan (no -format option) is deprecated - # - #if {[catch {clock scan $datepart} err]} { - # puts stderr "tcl clock scan failed err:'$err'" - # return 0 - #} - - } else { - return 0 - } - return 1 - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] -} - -namespace eval tomlish::parse { - #*** !doctools - #[subsection {Namespace tomlish::parse}] - #[para] - #[list_begin definitions] - - #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. 
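#A standalone usage sketch (assuming the tomlish package is loaded) of the datetime forms the
#tomlish::utils::is_datetime validator above is intended to accept: full RFC 3339 date-times,
#date-only and time-only values, seconds omitted, and a single space in place of the T separator.
foreach candidate {
    1979-05-27
    1979-05-27T00:32:00Z
    {1979-05-27 00:32:00-07:00}
    1979-05-27T07:32
    00:32:00.999999
} {
    puts [format "%-28s is_datetime:%d" $candidate [tomlish::utils::is_datetime $candidate]]
}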
- #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: - # - e.g some kind of backtracking required if using an ABNF parser? - #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" - #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' - - #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? - - #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) - - - variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text - - variable state - # states: - # table-space, itable-space, array-space - # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # dquoted-key, squoted-key - # string-state, literal-state, multistring... - # - # notes: - # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack - - # - # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax - # - #stateMatrix defines for each state, actions to take for each possible token. - #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. - #dual-element actions are a push instruction and the name of the space to push on the stack. - # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie only if not already in root table-space) - - # -- --- --- --- --- --- - #token/state naming guide - # -- --- --- --- --- --- - #tokens : underscore separated or bare name e.g newline, start_quote, start_squote - #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence) - #states : always contain at least one dash e.g err-state, table-space - #instructions - # -- --- --- --- --- --- - - - #stateMatrix dict of elements mapping current state to next state based on returned tokens - # current-state {token-encountered next-state ... } - # where next-state can be a 1 or 2 element list. - #If 2 element - the first item is an instruction (ucase) - #If 1 element - it is either a lowercase dashed state name or an ucase instruction - #e.g {PUSHSPACE } or POPSPACE or SAMESPACE - - - #SAMESPACE - got to same space as parent without popping a level, but has its own autotransition lookup - strange concept - review usecases - - variable stateMatrix - set stateMatrix [dict create] - #--------------------------------------------------------- - #WARNING - #The stateMatrix implementation here is currently messy. - #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. - #This means the state behaviour has to be reasoned about by looking at both in conjunction. - #--------------------------------------------------------- - - #xxx-space vs xxx-syntax inadequately documented - TODO - - #review - out of date?
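#A minimal standalone sketch (not the real dispatch code) of how an entry of this shape is
#consumed: a one-word action is a next state or bare instruction, a multi-word action is an
#instruction followed by its arguments. demoMatrix and demo_next are illustrative names only.
set demoMatrix [dict create \
    table-space {barekey {PUSHSPACE "keyval-space" state "keyval-syntax"} eof "end-state"} \
    keyval-tail {newline POPSPACE eof "end-state"}]
proc demo_next {matrix state token} {
    set action [dict get $matrix $state $token]
    switch -exact -- [lindex $action 0] {
        PUSHSPACE - POPSPACE - SAMESPACE - TOSTATE - zeropoppushspace {
            return [list instruction [lindex $action 0] args [lrange $action 1 end]]
        }
        default {
            return [list tostate [lindex $action 0]]
        }
    }
}
puts [demo_next $demoMatrix table-space barekey] ;# instruction PUSHSPACE args {keyval-space state keyval-syntax}
puts [demo_next $demoMatrix keyval-tail newline] ;# instruction POPSPACE args {}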
- # --------------------------------------------------------------------------------------------------------------# - # incomplete example of some state starting at table-space - # --------------------------------------------------------------------------------------------------------------# - # ( = -> keyval-value-expected) - # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) - # keyval-space (autotransition on push ^) - # table-space (barekey^) (startdquote -> dquoted-key ^) - # --------------------------------------------------------------------------------------------------------------# - - dict set stateMatrix\ - table-space { - bom "table-space"\ - whitespace "table-space"\ - newline "table-space"\ - barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ - dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXsingle_dquote "quoted-key"\ - XXXsingle_squote "squoted-key"\ - comment "table-space"\ - starttablename "tablename-state"\ - starttablearrayname "tablearrayname-state"\ - enddquote "err-state"\ - endsquote "err-state"\ - comma "err-state"\ - eof "end-state"\ - equal "err-state"\ - cr "err-lonecr"\ - } - - - - dict set stateMatrix\ - keyval-space {\ - whitespace "keyval-syntax"\ - equal "keyval-value-expected"\ - } - - # ' = ' portion of keyval - dict set stateMatrix\ - keyval-syntax {\ - whitespace "keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "keyval-value-expected"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #### - dict set stateMatrix\ - keyval-value-expected {\ - whitespace "keyval-value-expected"\ - untyped_value {TOSTATE "keyval-tail" note ""}\ - literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ - string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ - single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ - triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} - - #2025 - no leading-squote-space - only trailing-squote-space. - - dict set stateMatrix\ - keyval-tail {\ - whitespace "keyval-tail"\ - newline "POPSPACE"\ - comment "keyval-tail"\ - eof "end-state"\ - } - - - #itable-space/ curly-syntax : itables - # x={y=1,} - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #we don't get single_squote etc here - instead we get the resulting squotedkey token - - - # ??? 
review - something like this - # - # x={y =1,} - dict set stateMatrix\ - itable-keyval-syntax {\ - whitespace {TOSTATE "itable-keyval-syntax"}\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal {TOSTATE "itable-keyval-value-expected"}\ - newline "err-state"\ - eof "err-state"\ - } - - # x={y=1} - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } - - dict set stateMatrix\ - itable-keyval-value-expected {\ - whitespace "itable-keyval-value-expected"\ - untyped_value {TOSTATE "itable-val-tail" note ""}\ - single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ - triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ - startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ - startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ - } - #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' - # review - # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} - - - - # x={y=1,z="x"} - #POPSPACE is transition from itable-keyval-space to parent itable-space - dict set stateMatrix\ - itable-val-tail {\ - whitespace "itable-val-tail"\ - endinlinetable "POPSPACE"\ - comma "POPSPACE"\ - newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - comment "itable-val-tail"\ - eof "err-state"\ - } - # XXXnewline "POPSPACE" - # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail - # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record - #e.g - # x = { - # j=1 - # #comment within dottedkey j record - # , # comment unattached - # #comment unattached - # k=2 , #comment unattached - # l=3 #comment within l record - # , m=4 - # #comment associated with m record - # - # #still associated with m record - # } - ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. - #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma - #so we cant do: j= 1 #comment for j1 , - # and have the trailing comma recognised. - # - # To associate: j= 1, #comment for j1 - # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? - # - # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma - # is 'associated' with the previous entry. - # - # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, - # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments - # (e.g reordering records within an itable) - #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. 
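#A small standalone usage sketch of the multiline inline table form discussed above, assuming
#the tomlish package is loaded and tomlish::decode::toml is the entry point (it is referenced
#further below). Under the TOML 1.1 behaviour currently targeted the tokenizer accepts
#newlines and comments inside the braces, whereas strict TOML 1.0 would reject them:
set sample {
x = {
    j = 1 , # comment - runs to the end of the line
    k = 2
}
}
puts [tomlish::decode::toml $sample]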
- - - #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] - #it is for keyval ie x.y.z = value - - #this is the state after dot - #we are expecting a complete key token or whitespace - #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) - dict set stateMatrix\ - dottedkey-space {\ - whitespace "dottedkey-space"\ - dotsep "err-state"\ - barekey "dottedkey-space-tail"\ - squotedkey "dottedkey-space-tail"\ - dquotedkey "dottedkey-space-tail"\ - newline "err-state"\ - comma "err-state"\ - comment "err-state"\ - equal "err-state"\ - } - #dottedkeyend "POPSPACE" - #equal "POPSPACE"\ - - #jmn 2025 - #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop - dict set stateMatrix\ - dottedkey-space-tail {\ - whitespace "dottedkey-space-tail" - dotsep "dottedkey-space" - equal "POPSPACE"\ - eof "err-state"\ - newline "err-state"\ - } - - #-------------------------------------------------------------------------- - #scratch area - #from_toml {x=1} - # barekey tok - # table-space PUSHSPACE keyval-space state keyval-syntax - # - - - #-------------------------------------------------------------------------- - - - #REVIEW - #toml spec looks like heading towards allowing newlines within inline tables - #https://github.com/toml-lang/toml/issues/781 - - #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. - #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table - - #JMN2025 - #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES - #We currently allow multiline ITABLES (also with comments) in the tokenizer. - #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? - - - #JMN REVIEW - #dict set stateMatrix\ - # array-space {\ - # whitespace "array-space"\ - # newline "array-space"\ - # untyped_value "SAMESPACE"\ - # startarray {PUSHSPACE "array-space"}\ - # endarray "POPSPACE"\ - # startinlinetable {PUSHSPACE itable-space}\ - # single_dquote "string-state"\ - # single_squote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ - # comma "array-space"\ - # comment "array-space"\ - # eof "err-state-array-space-got-eof"\ - # } - - ## array-space ## - set aspace [dict create] - dict set aspace whitespace "array-space" - dict set aspace newline "array-space" - #dict set aspace untyped_value "SAMESPACE" - dict set aspace untyped_value "array-syntax" - dict set aspace startarray {PUSHSPACE "array-space"} - dict set aspace endarray "POPSPACE" - dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} - dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} - dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} - dict set aspace startinlinetable {PUSHSPACE itable-space} - #dict set aspace comma "array-space" - dict set aspace comment "array-space" - dict set aspace eof "err-state-array-space-got-eof" - dict set stateMatrix array-space $aspace - - #when we pop from an inner array we get to array-syntax - #e.g {x=[[]] ??? 
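#Another small standalone sketch (same assumptions as the previous sketch) exercising the
#array handling above: each '[' pushes a new array-space, each ']' POPSPACEs back to the
#containing space, and values inside an array land in array-syntax.
set sample {ports = [ [8001, 8002], ["a", 'b'], ]}
puts [tomlish::decode::toml $sample]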
- set tarntail [dict create] - dict set tarntail whitespace "tablearrayname-tail" - dict set tarntail newline "err-state" - dict set tarntail comment "err-state" - dict set tarntail eof "err-state" - dict set tarntail endtablename "tablearray-tail" - dict set stateMatrix tablearrayname-tail $tarntail - - #review - somewhat counterintuitive...? - # [(starttablearrayname) (endtablearrayname] - # [(starttablename) (endtablename)] - - # [[xxx]] ??? - set tartail [dict create] - dict set tartail whitespace "tablearray-tail" - dict set tartail newline "table-space" - dict set tartail comment "tablearray-tail" - dict set tartail eof "end-state" - dict set stateMatrix tablearray-tail $tartail - - - - - - - dict set stateMatrix\ - end-state {} - - set knowntokens [list] - set knownstates [list] - dict for {state transitions} $stateMatrix { - if {$state ni $knownstates} {lappend knownstates $state} - dict for {tok instructions} $transitions { - if {$tok ni $knowntokens} {lappend knowntokens $tok} - } - } - dict set stateMatrix nostate {} - foreach tok $knowntokens { - dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" - } - - - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #purpose - debugging? remove? - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #build a list of 'push triggers' from the stateMatrix - # ie tokens which can push a new space onto spacestack - set push_trigger_tokens [list] - tcl::dict::for {s transitions} $stateMatrix { - tcl::dict::for {token transition_to} $transitions { - set instruction [lindex $transition_to 0] - switch -exact -- $instruction { - PUSHSPACE - zeropoppushspace { - if {$token ni $push_trigger_tokens} { - lappend push_trigger_tokens $token - } - } - } - } - } - ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - - - - #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) - #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE - - #mainly for the -space states: - #redirect to another state $c based on a state transition from $whatever to $b - # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. - #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' - - #use dict set to add values so we can easily add/remove/comment lines - - #Push to, next - #default first states when we push to these spaces - variable spacePushTransitions [dict create] - dict set spacePushTransitions keyval-space keyval-syntax - dict set spacePushTransitions itable-keyval-space itable-keyval-syntax - dict set spacePushTransitions array-space array-space - dict set spacePushTransitions table-space tablename-state - #dict set spacePushTransitions #itable-space itable-space - - #Pop to, next - variable spacePopTransitions [dict create] - dict set spacePopTransitions array-space array-syntax - - - #itable-keyval-space itable-val-tail - #review - #we pop to keyval-space from dottedkey-space or from keyval-value-expected? 
we don't always want to go to keyval-tail - #leave it out and make the POPSPACE caller explicitly specify it - #keyval-space keyval-tail - - variable spaceSameTransitions [dict create] - #JMN test - #dict set spaceSameTransitions array-space array-syntax - - #itable-keyval-space itable-val-tail - - - variable state_list ;#reset every tomlish::decode::toml - - namespace export tomlish toml - namespace ensemble create - - #goNextState has various side-effects e.g pushes and pops spacestack - #REVIEW - setting nest and v elements here is ugly - #todo - make neater, more single-purpose? - proc goNextState {tokentype tok currentstate} { - variable state - variable nest - variable v - - set prevstate $currentstate - - - variable spacePopTransitions - variable spacePushTransitions - variable spaceSameTransitions - - variable last_space_action "none" - variable last_space_type "none" - variable state_list - - set result "" - set starttok "" - - if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { - set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] - ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" - switch -exact -- [lindex $transition_to 0] { - POPSPACE { - set popfromspace_info [spacestack peek] - set popfromspace_state [dict get $popfromspace_info state] - spacestack pop - set parent_info [spacestack peek] - set type [dict get $parent_info type] - set parentspace [dict get $parent_info state] - - set last_space_action "pop" - set last_space_type $type - - if {[dict exists $parent_info returnstate]} { - set next [dict get $parent_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" - } else { - ### - #review - do away with spacePopTransitions - which although useful to provide a default.. 
- # - involve error-prone configurations distant to the main state transition configuration in stateMatrix - if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { - set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" - } else { - set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" - } - } - set result $next - } - SAMESPACE { - set currentspace_info [spacestack peek] - ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" - set type [dict get $currentspace_info type] - set currentspace [dict get $currentspace_info state] - - if {[dict exists $currentspace_info returnstate]} { - set next [dict get $currentspace_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" - } else { - if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { - set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" - } else { - set next $currentspace - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" - } - } - set result $next - } - zeropoppushspace { - if {$nest > 0} { - #pop back down to the root level (table-space) - spacestack pop - set parentinfo [spacestack peek] - set type [dict get $parentinfo type] - set target [dict get $parentinfo state] - - set last_space_action "pop" - set last_space_type $type - - #----- - #standard pop - set parentlevel [expr {$nest -1}] - lappend v($parentlevel) [set v($nest)] - incr nest -1 - #----- - } - #re-entrancy - - #set next [list PUSHSPACE [lindex $transition_to 1]] - set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 - ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" - set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] - set result [dict get $transition_info newstate] - } - PUSHSPACE { - set original_target [dict get $transition_to PUSHSPACE] - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. - } - if {[dict exists $transition_to starttok]} { - set starttok [dict get $transition_to starttok] - } - spacestack push [dict create type space state $original_target] - - set last_space_action "push" - set last_space_type "space" - - if {[dict exists $transition_to state]} { - #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) 
- set next [dict get $transition_to state] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" - } else { - #puts $::tomlish::parse::spacePushTransitions - if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { - set next [dict get $::tomlish::parse::spacePushTransitions $original_target] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " - } else { - set next $original_target - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" - } - } - set result $next - } - TOSTATE { - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. - } - set result [dict get $transition_to TOSTATE] - } - default { - #simplified version of TOSTATE - set result [lindex $transition_to 0] ;#ignore everything but first word - } - } - } else { - ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" - set result "nostate" - } - lappend state_list [list tokentype $tokentype from $currentstate to $result] - set state $result - ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " - return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] - } - - proc report_line {{line ""}} { - variable linenum - variable is_parsing - if {$is_parsing} { - if {$line eq ""} { - set line $linenum - } - return "Line Number: $line" - } else { - #not in the middle of parsing tomlish text - return nothing. - return "" - } - } - - #produce a *slightly* more readable string rep of the nest for puts etc. - proc nest_pretty1 {list} { - set prettier "{" - - foreach el $list { - if { [lindex $el 0] eq "NEWLINE"} { - append prettier "[list $el]\n" - } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { - append prettier [nest_pretty1 $el] - } else { - append prettier "[list $el] " - } - } - append prettier "}" - return $prettier - } - - proc set_tokenType {t} { - variable tokenType - variable tokenType_list - if {![info exists tokenType]} { - set tokenType "" - } - lappend tokenType_list $t - set tokenType $t - } - - proc switch_tokenType {t} { - variable tokenType - variable tokenType_list - lset tokenType_list end $t - set tokenType $t - } - - proc get_tokenType {} { - variable tokenType - return $tokenType - } - - - proc get_token_waiting {} { - variable token_waiting - return [lindex $token_waiting 0] - } - proc clear_token_waiting {} { - variable token_waiting - set token_waiting [list] - } - - #token_waiting is a list - but our standard case is to have only one - #in certain circumstances such as near eof we may have 2 - #the set_token_waiting function only allows setting when there is not already one waiting. 
- #we want to catch cases of inadvertently trying to set multiple - # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. - proc set_token_waiting {args} { - if {[llength $args] %2 != 0} { - error "tomlish set_token_waiting must have args of form: type value complete 0|1" - } - variable token_waiting - - if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { - #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another - #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context - #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it - set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" - append err \n " - cannot add token_waiting: $args" - error $err - #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] - #set token_waiting [list] - } - - set waiting [dict create] - dict for {k v} $args { - switch -exact $k { - type - complete { - dict set waiting $k $v - } - value { - dict set waiting tok $v - } - startindex { - dict set waiting startindex $v - } - default { - error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" - } - } - } - if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { - error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" - } - if {![llength $token_waiting]} { - set token_waiting [list $waiting] - } else { - #an extra sanity-check that we don't have more than just the eof.. - if {[llength $token_waiting] > 1} { - set err "tomlish Unexpected. Existing token_waiting count > 1.\n" - foreach tw $token_waiting { - append err " $tw" \n - } - append err " - cannot add token_waiting: $waiting" - error $err - } - #last entry must be a waiting eof - set token_waiting [list $waiting [lindex $token_waiting end]] - } - return - } - - #returns 0 or 1 - #tomlish::parse::tok - #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag - # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) - # - interactive use? 
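#A hypothetical helper (not part of tomlish) illustrating the chunk-splitting concern raised
#above: if toml text arrives in arbitrary chunks, a boundary can fall between \r and \n, so a
#streaming wrapper would need to hold back a trailing \r until the next chunk is seen.
proc crlf_safe_chunk {carry chunk} {
    #returns a 2 element list: {text_safe_to_tokenize new_carry}
    set buf $carry$chunk
    if {[string index $buf end] eq "\r"} {
        return [list [string range $buf 0 end-1] "\r"]
    }
    return [list $buf ""]
}
set carry ""
foreach chunk [list "a = 1\r" "\nb = 2\r\n"] {
    lassign [crlf_safe_chunk $carry $chunk] ready carry
    #pass $ready to the tokenizer; keep $carry for the next chunk
}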
- - proc tok {s} { - variable nest - variable v - variable i - variable tok - variable type ;#character type - variable state ;#FSM - - - variable tokenType - variable tokenType_list - - - variable endToken - - variable lastChar - - variable braceCount - variable bracketCount - - - #------------------------------ - #Previous run found another (presumably single-char) token - #The normal case is for there to be only one dict in the list - #multiple is an exception - primarily for eof - variable token_waiting - if {[llength $token_waiting]} { - set waiting [lindex $token_waiting 0] - - set tokenType [dict get $waiting type] - set tok [dict get $waiting tok] - #todo: dict get $token_waiting complete - set token_waiting [lrange $token_waiting 1 end] - return 1 - } - #------------------------------ - - set resultlist [list] - set sLen [tcl::string::length $s] - - set slash_active 0 - set quote 0 - set c "" - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $s [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $s $i] - set cindex $i - set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] - tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" - #puts "got char $c during tokenType '$tokenType'" - incr i ;#must incr here because we do returns inside the loop - - switch -exact -- $ctest { - # { - set had_slash $slash_active - set slash_active 0 - - if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - #for multiliteral, multistring - data and/or end - incr i -1 - return 1 - } - _start_squote_sequence { - #pseudo token beginning with underscore - never returned to state machine - review - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i [tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" - } - whitespace { - # hash marks end of whitespace token - #do a return for the whitespace, set token_waiting - #set_token_waiting type comment value "" complete 1 - incr i -1 ;#leave comment for next run - return 1 - } - untyped_value { - #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? - #we will accept a comment marker as an immediate terminator of the untyped_value. - incr i -1 - return 1 - } - starttablename - starttablearrayname { - #fix! - error "tomlish Character '#' is invalid first character for $tokenType. 
[tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out - append tok $c - } - default { - #dquotedkey, string,literal, multistring - append tok $c - } - } - } else { - switch -- $state { - multistring-space { - set_tokenType stringpart - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "#" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "#" - } - default { - #start of token if we're not in a token - set_tokenType comment - set tok "" ;#The hash is not part of the comment data - } - } - } - } - lc { - #left curly brace - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i [tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - starttablearrayname { - #*bare* tablename can only contain letters,digits underscores - error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #valid in quoted parts - append tok $c - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\{" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - switch -exact -- $state { - itable-keyval-value-expected - keyval-value-expected { - #switch last key to tablename?? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - array-space - array-syntax { - #nested anonymous inline table - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\{" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\{" - } - default { - error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" - } - } - } - - } - rc { - #right curly brace - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - tablename { - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endinlinetable value "" complete 1 startindex $cindex - return 1 - } - starttablearrayname - tablearrayname { - if {$had_slash} {append tok "\\"} - #invalid! 
- but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } - default { - #end any other token - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-space { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname-state problem" - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - #invalid - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-val-tail { - set_tokenType "endinlinetable" - set tok "" - #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 - incr i -1 - return 1 - } - itable-keyval-syntax { - error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\}" - } - multiliteral-space { - set_tokenType "literalpart" ; #review - set tok "\}" - } - default { - #JMN2024b keyval-tail? - error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" - } - } - } - - } - lb { - #left square bracket - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename { - #change the tokenType - switch_tokenType "starttablearrayname" - set tok "" ;#no output into the tomlish list for this token - #any following whitespace is part of the tablearrayname, so return now - return 1 - } - tablename - tablearrayname { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - #append tok "\\[" - append tok {\[} - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - #invalid at this point - state machine should disallow: - # table -> starttablearrayname - # tablearray -> starttablearrayname - set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "\[" - } - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } - default { - #end any other token. 
- incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startarray" - set tok "\[" - return 1 - } - array-space - array-syntax { - #nested array? - set_tokenType "startarray" - set tok "\[" - return 1 - #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" - } - table-space { - #table name - #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray - #note that a starttablearrayname token may contain whitespace between the brackets - # e.g \[ \[ - set_tokenType "starttablename" - set tok "" ;#there is no output into the tomlish list for this token - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\[" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\[" - } - itable-space { - #handle state just to give specific error msg - error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" - } - default { - error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - rb { - #right square bracket - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - #???? - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } else { - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablename value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" - } - } - } - tablearrayname { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" - } - } - } - default { - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - array-syntax - array-space { - #invalid - but allow parser statemachine to report it. 
- set_tokenType "endarray" - set tok "\]" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endarray" - set tok "\]" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endtablename" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname problem" - set_tokenType "endtablearray" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-tail { - #[[xxx] - set_tokenType "endtablename" - #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\]" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\]" - } - default { - error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - bsl { - #backslash - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - #end whitespace token - incr i -1 ;#reprocess bsl in next run - return 1 - } else { - error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" - } - } - literal - literalpart - squotedkey { - #never need to set slash_active true when in single quoted tokens - append tok "\\" - set slash_active 0 - } - string - dquotedkey - comment { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - stringpart { - if {$slash_active} { - #assert - quotes empty - or we wouldn't have slash_active - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - starttablename - starttablearrayname { - error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" - } - tablename - tablearrayname { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - barekey { - error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" - } - default { - error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$slash_active} { - set_tokenType "stringpart" - set tok "\\\\" - set slash_active 0 - } else { - set slash_active 1 - } - } - multiliteral-space { - #nothing can be escaped in multiliteral-space - not even squotes (?) review - set_tokenType "literalpart" - set tok "\\" - } - default { - error "tomlish tok error: Unexpected backslash when no token is active. 
[tomlish::parse::report_line]" - } - } - } - } - sq { - #single quote - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote { - #for within multiliteral - #short tentative_accum_squote tokens are returned if active upon receipt of any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - #assert state = trailing-squote-space - append tok $c - if {$existingtoklen == 4} { - #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 - #return tok with value ''''' - return 1 - } - } - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - #no conclusion can yet be reached - append tok $c - } - 2 { - #enter multiliteral - #switch? - append tok $c - set_tokenType triple_squote - return 1 - } - default { - #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled - #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the - #extra 1 or 2 squotes as data. - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" - } - } - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq - return 1 - } - literal { - #slash_active always false - #terminate the literal - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - literalpart { - #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) - #todo - # idea: end this literalpart (possibly 'temporarily') - # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack - # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing - return 1 - } - XXXitablesquotedkey { - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - squotedkey { - ### - #set_token_waiting type endsquote value "'" complete 1 - return 1 - } - starttablename - starttablearrayname { - #!!! - incr i -1 - return 1 - } - tablename - tablearrayname { - append tok $c - } - barekey { - #barekeys now support all sorts of unicode letter/number chars for other cultures - #but not punctuation - not even for those of Irish heritage who don't object - #to the anglicised form of some names. - # o'shenanigan seems to not be a legal barekey - #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. - error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" - } - default { - append tok $c - } - } - } else { - switch -exact -- $state { - array-space - keyval-value-expected - itable-keyval-value-expected { - #leading squote - #pseudo-token _start_squote_sequence ss not received by state machine - #This pseudotoken will trigger production of single_squote token or triple_squote token - #It currently doesn't trigger double_squote token - #(handle '' same as 'x' ie produce a single_squote and go into processing literal) - #review - producing double_squote for empty literal may be slightly more efficient. - #This token is not used to handle squote sequences *within* a multiliteral - set_tokenType "_start_squote_sequence" - set tok "'" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up a tentative_accum_squote to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote - set tok "'" - return 1 - } - table-space - itable-space { - #tests: squotedkey.test squotedkey_itable.test - set_tokenType "squotedkey" - set tok "" - } - XXXtable-space - XXXitable-space { - #future - could there be multiline keys? MLLKEY, MLBKEY ? - #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) - #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key - #where key is simple-key or dotted-key - no MLL or MLB components - #the spec states solution for arbitrary binary data is application specific involving encodings - #such as hex, base64 - set_tokenType "_start_squote_sequence" - set tok "'" - return 1 - } - tablename-state { - #first char in tablename-state/tablearrayname-state - set_tokenType "tablename" - append tok "'" - } - tablearrayname-state { - set_tokenType "tablearrayname" - append tok "'" - } - literal-state { - #shouldn't get here? review - tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType "literal" - incr -1 - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "," - #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - dottedkey-space { - set_tokenType "squotedkey" - } - default { - error "tomlish unhandled squote during state '$state'. 
[tomlish::parse::report_line]" - } - } - } - - } - dq { - #double quote - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - tentative_accum_dquote { - #within multistring - #short tentative_accum_dquote tokens are returned if active upon receipt of any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - #assert state = trailing-squote-space - append tok $c - if {$existingtoklen == 4} { - #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 - #return tok with value """"" - return 1 - } - } - _start_dquote_sequence { - #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - #no conclusion can yet be reached - append tok $c - } - 2 { - #enter multistring - #switch? - append tok $c - set_tokenType triple_dquote - return 1 - } - default { - #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled - #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the - #extra 1 or 2 dquotes as data. - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" - } - } - } - literal - literalpart { - append tok $c - } - string { - if {$had_slash} { - append tok "\\" $c - } else { - #unescaped quote always terminates a string - set_token_waiting type enddquote value "\"" complete 1 startindex $cindex - return 1 - } - } - stringpart { - #sub element of multistring - if {$had_slash} { - append tok "\\" $c - } else { - incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing - return 1 - } - } - whitespace { - #assert: had_slash will only ever be true in multistring-space - if {$had_slash} { - incr i -2 - return 1 - } else { - #end whitespace token - throw dq back for reprocessing - incr i -1 - return 1 - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - XXXdquotedkey { - if {$had_slash} { - append tok "\\" - append tok $c - } else { - set_token_waiting type enddquote value "\"" complete 1 startindex $cindex - return 1 - } - } - dquotedkey { - ### - if {$had_slash} { - append tok "\\" - append tok $c - } else { - #set_token_waiting type enddquote value {"} complete 1 - return 1 - } - } - squotedkey { - append tok $c - } - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - starttablearrayname { - incr i -1 ;## - return 1 - } - default { - error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - #$slash_active not relevant when no tokenType - #token is string only if we're expecting a value at this point - switch -exact -- $state { - array-space - keyval-value-expected - itable-keyval-value-expected { - #leading dquote - #pseudo-token _start_squote_sequence ss not received by state machine - #This pseudotoken will trigger production of single_dquote token or triple_dquote token - #It currently doesn't trigger double_dquote token - #(handle "" same as "x" ie produce a single_dquote and go into processing string) - #review - producing double_dquote for empty 
string may be slightly more efficient. - #This token is not used to handle dquote sequences once *within* a multistring - set_tokenType "_start_dquote_sequence" - set tok {"} - } - multistring-space { - if {$had_slash} { - set_tokenType "stringpart" - set tok "\\\"" - } else { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up a tentative_accum_squote to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote - set tok {"} - return 1 - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\"" - } - table-space - itable-space { - set_tokenType "dquotedkey" - set tok "" - } - dottedkey-space { - set_tokenType dquotedkey - set tok "" - - #only if complex keys become a thing - #set_tokenType dquote_seq_begin - #set tok $c - } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - default { - error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" - } - } - } - } - = { - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - #assertion had_slash 0 - append tok $c - } - string - comment - dquotedkey { - #for these tokenTypes an = is just data. - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - set_token_waiting type equal value = complete 1 startindex $cindex - return 1 - } - } - barekey { - #set_token_waiting type equal value = complete 1 - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out - append tok $c - } - default { - error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok = - } - multiliteral-space { - set_tokenType "literalpart" - set tok "=" - } - dottedkey-space { - set_tokenType "equal" - set tok "=" - return 1 - } - default { - set_tokenType "equal" - set tok = - return 1 - } - } - } - } - cr { - #REVIEW! - # \r carriage return - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
- set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #we have received a double cr - ::tomlish::log::warn "double cr - will generate cr token. needs testing" - set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal { - append tok $c - } - literalpart { - #part of MLL string (multi-line literal string) - #we need to split out crlf as a separate NEWLINE to be consistent - ::tomlish::log::warn "literalpart ended by cr - needs testing" - #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space - incr i -1 - return 1 - } - stringpart { - #stringpart is a part of MLB string (multi-line basic string) - #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #could in theory be valid in quoted part of name - #review - might be better just to disallow here - append tok $c - } - whitespace { - #it should technically be part of whitespace if not followed by lf - #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW - incr i -1 - return 1 - } - untyped_value { - incr i -1 - return 1 - } - default { - #!todo - error out if cr inappropriate for tokenType - append tok $c - } - } - } else { - #lf may be appended if next - #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) 
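            #Illustrative sketch (assuming ::tomlish::decode::toml as the entry point, by analogy with
            #the from_toml example later in this module): a line ending in \r\n outside of values is
            #expected to surface as a single {NEWLINE crlf} element, e.g roughly
            #  "a=1\r\n"  ->  KEY a = {INT 1} {NEWLINE crlf}
            #whereas a bare \n gives {NEWLINE lf}, and a lone cr becomes its own "cr" token which the
            #state machine will generally reject.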
- set_tokenType "newline" - set tok cr - } - } - lf { - # \n newline - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #review - #this lf is the trailing part of a crlf - append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - #multiliteral or multistring - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal { - #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' - #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - literalpart { - #we allow newlines - but store them within the multiliteral as their own element - #This is a legitimate end to the literalpart - but not the whole multiliteral - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - stringpart { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - starttablename - tablename - tablearrayname - starttablearrayname { - error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" - } - default { - #newline ends all other tokens. - #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) - #note for whitespace: - # we will use the convention that \n terminates the current whitespace even if whitespace follows - # ie whitespace is split into separate whitespace tokens at each newline - - #puts "-------------- newline lf during tokenType $tokenType" - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType "newline" - set tok lf - return 1 - } - } - multiliteral-space { - #assert had_slash 0 - set_tokenType "newline" - set tok "lf" - return 1 - } - default { - #ignore slash? error? 
- set_tokenType "newline" - set tok lf - return 1 - } - } - #if {$had_slash} { - # #CONT directly before newline - allows strings_5_byteequivalent test to pass - # set_tokenType "continuation" - # set tok "\\" - # incr i -1 - # return 1 - #} else { - # set_tokenType newline - # set tok lf - # return 1 - #} - } - } - , { - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - comment - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok , - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - #stringpart can have up to 2 quotes too - if {$had_slash} {append tok "\\"} - append tok $c - } - literal - literalpart - squotedkey { - #assert had_slash always 0 - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - set_token_waiting type comma value "," complete 1 startindex $cindex - return 1 - } - } - default { - set_token_waiting type comma value "," complete 1 startindex $cindex - if {$had_slash} {append tok "\\"} - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "," - } - multiliteral-space { - #assert had_slash 0 - set_tokenType "literalpart" - set tok "," - } - default { - set_tokenType "comma" - set tok "," - return 1 - } - } - } - } - . { - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - comment - untyped_value { - if {$had_slash} {append tok "\\"} - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - literal - literalpart - squotedkey { - #assert had_slash always 0 - append tok $c - } - whitespace { - switch -exact -- $state { - multistring-space { - #review - if {$had_slash} { - incr i -2 - } else { - incr i -1 - } - return 1 - } - xxxdottedkey-space { - incr i -1 - return 1 - } - dottedkey-space-tail { - incr i -1 - return 1 - } - default { - error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" - } - } - } - starttablename - starttablearrayname { - #This would correspond to an empty table name - error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #subtable - split later - review - append tok $c - } - barekey { - #e.g x.y = 1 - #we need to transition the barekey to become a structured table name ??? review - #x is the tablename y is the key - set_token_waiting type dotsep value "." 
complete 1 startindex $cindex - return 1 - } - default { - error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" - #set_token_waiting type period value . complete 1 - #return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "." - } - multiliteral-space { - set_tokenType "literalpart" - set tok "." - } - XXXdottedkey-space { - ### obs? - set_tokenType "dotsep" - set tok "." - return 1 - } - dottedkey-space-tail { - ### - set_tokenType "dotsep" - set tok "." - return 1 - } - default { - set_tokenType "untyped_value" - set tok "." - } - } - } - - } - " " { - if {[tcl::string::length $tokenType]} { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - #todo had_slash - emit token or error - #whitespace is a terminator for bare keys - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - comment { - if {$had_slash} { - append tok "\\" - } - append tok $c - } - string - dquotedkey { - if {$had_slash} { append tok "\\" } - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART xxx WS " " - incr i -1 - return 1 - } - } - literal - literalpart - squotedkey { - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - append tok $c - } else { - append tok $c - } - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. 
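            #Illustrative sketch of that later normalization (see the tomlish::to_dict::tablename_split
            #calls in to_dict with normalize=true - the normalized form is a list of key segments, never
            #a re-joined string):
            #  [ fruits . metadata ]  -> raw tablename token keeps the spaces, normalized segments {fruits metadata}
            #  a.'b.c'.d              -> normalized segments {a b.c d} (not the same table as a.b.c.d)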
- set_tokenType tablename - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType "whitespace" - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - if {$had_slash} { - error "tomlish unexpected backslash [tomlish::parse::report_line]" - } - set_tokenType "whitespace" - append tok $c - } - } - } - } - tab { - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - #whitespace is a terminator for bare keys - incr i -1 - #set_token_waiting type whitespace value $c complete 1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - squotedkey { - append tok $c - } - dquotedkey - string - comment - whitespace { - #REVIEW - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - incr i -1 - return 1 - } - } - literal - literalpart { - append tok $c - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - if {$slash_active} { - set slash_active 0 - } - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - set_tokenType "whitespace" - append tok $c - } - } - } - } - bom { - #BOM (Byte Order Mark) - ignored by token consumer - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - #assert - tok will be one or two squotes only - #A toml literal probably isn't allowed to contain this - #but we will parse and let the validator sort it out. 
- incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart { - append tok $c - } - string - stringpart { - append tok $c - } - default { - #state machine will generally not have entry to accept bom - let it crash - set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - multistring-space { - set_tokenType "stringpart" - set tok $c - } - default { - set_tokenType "bom" - set tok "\uFEFF" - return 1 - } - } - } - } - default { - - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - #review - incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. - return 1 - } - } - barekey { - if {[tomlish::utils::is_barekey $c]} { - append tok $c - } else { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" - } - } - starttablename - starttablearrayname { - incr i -1 - #allow statemachine to set context for subsequent chars - return 1 - } - stringpart { - append tok $c - } - default { - #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname - append tok $c - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - table-space - itable-space { - #if no currently active token - assume another key value pair - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } - multistring-space { - set_tokenType "stringpart" - if {$had_slash} { - set tok \\$c - } else { - set tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - tablename-state { - set_tokenType "tablename" - set tok $c - } - tablearrayname-state { - set_tokenType "tablearrayname" - set tok $c - } - dottedkey-space { - set_tokenType barekey - set tok $c - } - default { - #todo - something like ansistring VIEW to show control chars? - set cshow [string map [list \t tab \v vt] $c] - tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" - set_tokenType "untyped_value" - set tok $c - } - } - } - } - } - - } - - #run out of characters (eof) - if {[tcl::string::length $tokenType]} { - #check for invalid ending tokens - #if {$state eq "err-state"} { - # error "Reached end of data whilst tokenType = '$tokenType'. 
INVALID" - #} - switch -exact -- $tokenType { - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open literal - error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" - } - 2 { - set_tokenType "literal" - set tok "" - return 1 - - ##review - #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] - #set_tokenType "literal" - #set tok "" - #return 1 - } - } - } - _start_dquote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open string - error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" - } - 2 { - set_tokenType "string" - set tok "" - return 1 - } - } - } - newline { - #The only newline token that has still not been returned should have a tok value of "cr" - puts "tomlish eof reached - with incomplete newline token '$tok'" - if {$tok eq "cr"} { - #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. - #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) - #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values - # ie as it's own token. - switch_tokenType "cr" - return 1 - } else { - #should be unreachable - error "tomlish eof reached - with invalid newline token. value: $tok" - } - } - } - set_token_waiting type eof value eof complete 1 startindex $i ;#review - return 1 - } else { - ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" - set tokenType "eof" - set tok "eof" - } - return 0 - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] -} - -namespace eval tomlish::dict { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - - proc is_tomlish_typeval {d} { - #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} - #as a sanity check we need to avoid mistaking user data that happens to match same form - #consider x.y={type="spud",value="blah"} - #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. - #check the length of the type as a quick way to see it's a tag - not something else masqerading. 
- expr {[dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} - } - proc is_tomlish_typeval2 {d} { - upvar ::tomlish::tags tags - expr {[lindex $d 0] eq "type" && [lindex $d 1] in $tags} - } - proc last_tomltype_posn {d} { - set last_simple -1 - set dictposn [expr {[dict size $d] -1}] - foreach k [lreverse [dict keys $d]] { - set dval [dict get $d $k] - if {[is_tomlish_typeval $dval]} { - set last_simple $dictposn - break - } - incr dictposn -1 - } - return $last_simple - } - - - #review - proc name_from_tablestack {tablestack} { - set name "" - foreach tinfo [lrange $tablestack 1 end] { - lassign $tinfo type namepart - switch -- $type { - T { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - I { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - default { - #end at first break in the leading sequence of T & I tablenames - break - } - } - } - return $name - } - - proc _show_tablenames {tablenames_info} { - append msg \n "tablenames_info:" \n - dict for {tkey tinfo} $tablenames_info { - append msg " " "table: $tkey" \n - dict for {field finfo} $tinfo { - append msg " " "$field $finfo" \n - } - } - return $msg - } -} - - -tcl::namespace::eval tomlish::app { - #*** !doctools - #[subsection {Namespace tomlish::app}] - #[para] - #[list_begin definitions] - - tcl::namespace::eval argdoc { - proc test_suites {} { - if {[package provide test::tomlish] eq ""} { - return [list] - } - return [test::tomlish::SUITES] - } - } - - proc decoder {args} { - #*** !doctools - #[call app::[fun decoder] [arg args]] - #[para] read toml on stdin until EOF - #[para] on error - returns non-zero exit code and writes error on stderr - #[para] on success - returns zero exit code and writes JSON encoding of the data on stdout - #[para] This decoder is intended to be compatible with toml-test - - set opts [dict merge [dict create] $args] - #fconfigure stdin -encoding utf-8 - fconfigure stdin -translation binary - #Just slurp it all - presumably we are not handling massive amounts of data on stdin. - # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. - if {[catch { - set toml [read stdin] - }]} { - exit 2 ;#read error - } - try { - set j [::tomlish::toml_to_json $toml] - } on error {em} { - puts stderr "decoding failed: '$em'" - exit 1 - } - puts -nonewline stdout $j - exit 0 - } - - proc encoder {args} { - #*** !doctools - #[call app::[fun encoder] [arg args]] - #[para] read JSON on stdin until EOF - #[para] return non-zero exitcode if JSON data cannot be converted to a valid TOML representation - #[para] return zero exitcode and TOML data on stdout if JSON data can be converted. 
- #[para] This encoder is intended to be compatible with toml-test - - set opts [dict merge [dict create] $args] - fconfigure stdin -translation binary - if {[catch { - set json [read stdin] - }]} { - exit 2 ;#read error - } - try { - set toml [::tomlish::json_to_toml $json] - } on error {em} { - puts stderr "encoding failed: '$em'" - exit 1 - } - puts -nonewline stdout $toml - exit 0 - } - - package require punk::args - punk::args::define { - @dynamic - @id -id ::tomlish::app::test - @cmd -name tomlish::app::test - @leaders - @opts -any 1 - -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}} - @values -min 0 -max 0 - } - proc test {args} { - package require test::tomlish - set argd [punk::args::parse $args withid ::tomlish::app::test] - set opts [dict get $argd opts] - set opt_suite [dict get $opts -suite] - test::tomlish::SUITE $opt_suite - #if {[catch {test::tomlish::SUITE $opt_suite} errM]} { - # puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]" - # exit 1 - #} - set run_opts [dict remove $opts -suite] - test::tomlish::RUN - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::app ---}] -} - -proc ::tomlish::appnames {} { - set applist [list] - foreach cmd [info commands ::tomlish::app::*] { - lappend applist [namespace tail $cmd] - } - return $applist -} - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Secondary API namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish::lib { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - #*** !doctools - #[subsection {Namespace tomlish::lib}] - #[para] Secondary functions that are part of the API - #[list_begin definitions] - - #proc utility1 {p1 args} { - # #*** !doctools - # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] - # #[para]Description of utility1 - # return 1 - #} - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -if {[info exists ::argc] && $::argc > 0} { - puts stderr "argc: $::argc args: $::argv" - set arglist $::argv - # -------------- - #make sure any dependant packages that are sourced don't get any commandline args - set ::argv {} - set ::argc 0 - # -------------- - package require punk::args - punk::args::define { - @dynamic - @id -id tomlish::cmdline - @cmd -name tomlish -help\ - "toml encoder/decoder written in Tcl" - @opts -any 1 - -help -type none -help\ - "display usage" - -app -choices {${[tomlish::appnames]}} - } - set argd [punk::args::parse $arglist withid tomlish::cmdline] - lassign [dict values $argd] leaders opts values received - if {[dict exists $received -help] || ![dict exists $received -app]} { - #puts stdout "Usage: -app where appname one of:[tomlish::appnames]" - puts stderr [punk::args::usage tomlish::cmdline] - exit 0 - } - - set app [dict get $opts -app] - set appnames [tomlish::appnames] - set app_opts [dict remove $opts -app] - tomlish::app::$app {*}$app_opts - - #set opts [dict create] - #set opts [dict merge $opts $::argv] - - #set opts_understood [list -app ] - #if {"-app" in [dict keys $opts]} { - # #Don't vet the remaining opts - as they are interpreted by each app - #} else { - # foreach key [dict keys $opts] { - # if {$key ni $opts_understood} { - # puts stderr "Option '$key' not understood" - # exit 1 - # } - # } - #} - #if {[dict exists $opts -app]} { - # set app [dict get $opts -app] - # set appnames 
[tomlish::appnames] - # if {$app ni $appnames} { - # puts stderr "app '[dict get $opts -app]' not found. Available apps: $appnames" - # exit 1 - # } - # tomlish::app::$app {*}$opts - #} -} - -## Ready -package provide tomlish [namespace eval tomlish { - variable pkg tomlish - variable version - set version 1.1.4 -}] -return - -#*** !doctools -#[manpage_end] - diff --git a/src/vendormodules/tomlish-1.1.5.tm b/src/vendormodules/tomlish-1.1.5.tm deleted file mode 100644 index a8f33d38..00000000 --- a/src/vendormodules/tomlish-1.1.5.tm +++ /dev/null @@ -1,6991 +0,0 @@ -# -*- tcl -*- -# Maintenance Instruction: leave the 999999.xxx.x as is and use 'pmix make' or src/make.tcl to update from -buildversion.txt -# -# Please consider using a BSD or MIT style license for greatest compatibility with the Tcl ecosystem. -# Code using preferred Tcl licenses can be eligible for inclusion in Tcllib, Tklib and the punk package repository. -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# (C) 2024 -# -# @@ Meta Begin -# Application tomlish 1.1.5 -# Meta platform tcl -# Meta license -# @@ Meta End - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# doctools header -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -#*** !doctools -#[manpage_begin tomlish_module_tomlish 0 1.1.5] -#[copyright "2024"] -#[titledesc {tomlish toml parser}] [comment {-- Name section and table of contents description --}] -#[moddesc {tomlish}] [comment {-- Description at end of page heading --}] -#[require tomlish] -#[keywords module parsing toml configuration] -#[description] -#[para] tomlish is an intermediate representation of toml data in a tree structure (tagged lists representing type information) -#[para] The design goals are for tomlish to be whitespace and comment preserving ie byte-for byte preservation during roundtrips from toml to tomlish and back to toml -#[para] The tomlish representation can then be converted to a Tcl dict structure or to other formats such as json, -#[para] although these other formats are generally unlikely to retain whitespace or comments -#[para] The other formats also won't preserve roundtripability e.g \t and a literal tab coming from a toml file will be indistinguishable. -#[para] A further goal is to allow at least a useful subset of in-place editing operations which also preserve whitespace and comments. -#[para] e.g leaf key value editing, and table reordering/sorting, key-renaming at any level, key insertions/deletions -#[para] The API for editing (tomldoc object?) may require explicit setting of type if accessing an existing key -#[para] e.g setting a key that already exists and is a different type (especially if nested structure such as a table or array) -#[para] will need a -type option (-force ?) to force overriding with another type such as an int. 
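#Illustrative sketch of the two views (output shown approximately - see the from_toml example
#further below for the exact element layout):
#  set tomlish [tomlish::decode::toml "a=1\nb=2\n"]
#    -> TOMLISH tagged list preserving newlines, roughly:
#       TOMLISH {KEY a = {INT 1} {NEWLINE lf}} {KEY b = {INT 2} {NEWLINE lf}}
#  set d [tomlish::to_dict $tomlish]
#    -> basic dict view with whitespace/comments dropped, roughly:
#       a {type INT value 1} b {type INT value 2}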
- -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section Overview] -#[para] overview of tomlish -#[subsection Concepts] -#[para] - - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -## Requirements -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[subsection dependencies] -#[para] packages used by tomlish -#[list_begin itemized] - -package require Tcl 8.6- -package require struct::stack -package require logger - -#*** !doctools -#[item] [package {Tcl 8.6-}] -#[item] [package {struct::stack}] - -#limit ourselves to clear, destroy, peek, pop, push, rotate, or size (e.g v 1.3 does not implement 'get') - - -#*** !doctools -#[list_end] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -#*** !doctools -#[section API] - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Base namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish { - namespace export {[a-z]*}; # Convention: export all lowercase - variable types - - #default interp recursionlimit of 1000 is insufficient to pass 1000 deep nested structures as in certain toml tests. - #e.g https://github.com/iarna/toml-spec-tests/tree/latest/values - #1000 seems deep for a 'configuration' format - but toml sometimes used for other serialisation purposes. - #todo - review - set existing_recursionlimit [interp recursionlimit {}] - if {$existing_recursionlimit < 5000} { - interp recursionlimit {} 5000 - } - - #IDEAS: - # since get_toml produces tomlish with whitespace/comments intact: - # tomldoc object - allow (at least basic?) editing of toml whilst preserving comments & whitespace - # - setKey (set leaf only to value) how to specify type? -type option? - whole array vs index into arrays and further nested objects? - option for raw toml additions? - # - separate addKey?? - # - deleteKey (delete leaf) - # - deleteTable (delete table - if only has leaves? - option to delete with child tables?) - # - set/add Table? - position in doc based on existing tables/subtables? - - #The tomlish intermediate representation allows things such as sorting the toml document by table name or other re-ordering of tables - - # because the tables include subkeys, comments and newlines within their structure - those elements all come along with it nicely during reordering. - #The same goes for the first newline following a keyval e.g x=1 \ny=2\n\n - #The newline is part of the keyval structure so makes reordering easier - #example from_toml "a=1\nb=2\n\n\n" - # 0 = TOMLISH - # 1 = KEY a = {INT 1} {NEWLINE lf} - # 2 = NEWLINE lf - # 3 = KEY b = {INT 2} {NEWLINE lf} - # 4 = NEWLINE lf - # 5 = NEWLINE lf - - #This reordering idea is complicated by the nature of tablearrays - especially as a table header references last tablearrayname, - # and duplicate table headers are allowed in that context. - #e.g - #[[fruits]] - # name="apple" - # [fruits.metadata] - # id=1 - # - #[unrelated1] - # - #[[fruits]] - # name="pear" - # - #[unrelated2] - # silly="ordering" - # - #[fruits.metadata] - #id=2 - #The TABLEARRAY record can't be completely selfcontained on the default parsing mechanism - because it is legal (though not recommended) to have unrelated tables in between. 
- #If we were to 'insert' later related records (such as the 2nd [fruits.metadata] above) into the TABLEARRAY structure - then, even though it might produce 'nicer' toml, - # we would lose roundtripability toml->tomlish->toml - # ----------------------------------------------------- - #REVIEW - #todo - some sort of 'normalize'/'grouping' function on tomlish that at least makes records self-contained, and perhaps then (optionally) reorders resulting records sensibly. - #such a function on the tomlish may work - although it would be unwise to duplicate the validation aspects of to_dict - #The most practical way might be to use to_dict followed by from_dict - but that would lose comment info and formatting. - #In the above example - The decision by the toml author to put [unrelated1] between related tablearrays should be respected, - #but the positioning of [unrelated2] between a tablearray and one of its contained tables is suspect. - #Both [fruits.metadata] table records should theoretically be added as children to their corresponding [[fruits]] tablearray record in the tomlish. (just as their name keys are) - # ----------------------------------------------------- - - - - #ARRAY is analogous to a Tcl list - #TABLE is analogous to a Tcl dict - #WS = inline whitespace - #KEY = bare key and value - #DQKEY = double quoted key and value - #SQKEY = single quoted key and value - #ITABLE = inline table (*can* be anonymous table) - # inline table values immediately create a table with the opening brace - # inline tables are fully defined between their braces, as are dotted-key subtables defined within - # No additional subtables or arrays of tables may be defined within an inline table after the ending brace - they must be entirely self-contained - - set tags [list TOMLISH ARRAY TABLE ITABLE TABLEARRAY WS NEWLINE COMMENT DOTTEDKEY KEY DQKEY SQKEY STRING STRINGPART MULTISTRING LITERAL LITERALPART MULTILITERAL INT FLOAT BOOL DATETIME] - #removed - ANONTABLE - #tomlish v1.0 should accept arbitrary 64-bit signed ints (from -2^63 to 2^63-1) - #we will restrict to this range for compatibility for now - although Tcl can handle larger (arbitrarily so?) 
- #todo - configurable - allow empty string for 'unlimited' - set min_int -9223372036854775808 ;#-2^63 - set max_int +9223372036854775807 ;#2^63-1 - - proc Dolog {lvl txt} { - #return "$lvl -- $txt" - set msg "[clock format [clock seconds] -format "%Y-%m-%dT%H:%M:%S"] tomlish '$txt'" - puts stderr $msg - } - logger::initNamespace ::tomlish - foreach lvl [logger::levels] { - interp alias {} tomlish_log_$lvl {} ::tomlish::Dolog $lvl - log::logproc $lvl tomlish_log_$lvl - } - - #*** !doctools - #[subsection {Namespace tomlish}] - #[para] Core API functions for tomlish - #[list_begin definitions] - - proc tags {} { - return $::tomlish::tags - } - - #helper function for to_dict - proc _get_keyval_value {keyval_element} { - log::notice ">>> _get_keyval_value from '$keyval_element'<<<" - #find the value (or 2 values if space separated datetime - and stitch back into one) - # 3 is the earliest index at which the value could occur (depending on whitespace) - if {[lindex $keyval_element 2] ne "="} { - error "tomlish _get_keyval_value keyval_element doesn't seem to be a properly structured { = } list" - } - - #review - if {[uplevel 1 [list info exists tablenames_info]]} { - upvar tablenames_info tablenames_info - } else { - set tablenames_info [dict create] ;#keys are lists {parenttable subtable etc} corresponding to parenttable.subtable.etc - } - set sublist [lrange $keyval_element 2 end] - - set values [list] - set value_posns [list] - set posn 0 - foreach sub $sublist { - #note that a barekey/dquotedkey won't occur directly inside a barekey/dquotedkey - switch -exact -- [lindex $sub 0] { - STRING - LITERAL - MULTISTRING - MULTILITERAL - INT - FLOAT - BOOL - DATETIME - TABLE - ARRAY - ITABLE { - lappend values $sub - lappend value_posns $posn - } - default {} - } - incr posn - } - switch -- [llength $values] { - 0 { - error "tomlish Failed to find value element in KEY. '$keyval_element'" - } - 1 { - lassign [lindex $values 0] type value - } - 2 { - #validate than exactly single space was between the two values - lassign $value_posns p1 p2 - if {$p2 != $p1 +2} { - #sanity check - #can probably only get here through manual manipulation of the tomlish list to an unprocessable form - error "tomlish KEY appears to have more than one part - but not separated by whitespace - invalid '$keyval_element'" - } - set between_token [lindex $sublist $p1+1] - if {[lindex $between_token 1] ne " "} { - error "tomlish KEY in 2 parts is not separated by a single space - cannot consider for datetime '$keyval_element'" - } - lassign [lindex $values 0] type_d1 value_d1 - lassign [lindex $values 1] type_d2 value_d2 - if {$type_d1 ne "DATETIME" || $type_d2 ne "DATETIME"} { - error "tomlish KEY in 2 parts does not appear to be datetime '$keyval_element'" - } - set type DATETIME - set value "${value_d1}T${value_d2}" - } - default { - error "tomlish Found multiple value elements in KEY, expected one. (or 2 for space-separated datetime) '$keyval_element'" - } - } - - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - #simple (non-container, no-substitution) datatype - set result [list type $type value $value] - } - STRING - STRINGPART { - set result [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL - LITERALPART { - #REVIEW - set result [list type $type value $value] - } - TABLE { - #invalid? - error "tomlish _get_keyval_value invalid to have type TABLE on rhs of =" - } - ITABLE { - #This one should not be returned as a type value structure! 
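        #Sketch of the shapes this proc returns (illustrative - string unescaping and the
        #space-separated datetime stitching are handled in the branches above):
        #  k = 3                    -> {type INT value 3}
        #  k = "a\tb"               -> {type STRING value ...} with the \t resolved to a literal tab
        #  k = 2025-01-01 10:00:00  -> {type DATETIME value 2025-01-01T10:00:00} (two DATETIME parts re-joined with T)
        #  k = {x=1}                -> a plain dict produced via to_dict, with no surrounding {type value} wrapper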
- # - set result [::tomlish::to_dict [ list [lindex $values 0] ]] - } - ARRAY { - #we need to recurse to get the corresponding dict for the contained item(s) - #pass in the whole [lindex $values 0] (type val) - not just the $value! - set prev_tablenames_info $tablenames_info - set tablenames_info [dict create] - set result [list type $type value [ ::tomlish::to_dict [ list [lindex $values 0] ] ]] - set tablenames_info $prev_tablenames_info - } - MULTISTRING - MULTILITERAL { - #review - mapping these to STRING might make some conversions harder? - #if we keep the MULTI - we know we have to look for newlines for example when converting to json - #without specific types we'd have to check every STRING - and lose info about how best to map chars within it - set result [list type $type value [ ::tomlish::to_dict [ list [lindex $values 0] ] ]] - } - default { - error "tomlish Unexpected value type '$type' found in keyval '$keyval_element'" - } - } - return $result - } - - - - #to_dict is a *basic* programmatic datastructure for accessing the data. - # produce a dictionary of keys and values from a tomlish tagged list. - # to_dict is primarily for reading toml data. - #Extraneous (not within quoted sections) whitespace and comments are not preserved in this structure, - # so a roundtrip from toml to this datastructure and back to toml will lose whitespace formatting and comments. - # creating/changing toml values can be done directly on a tomlish list if preserving (or adding) formatting/comments is desired. - #A separate package 'tomlish::object' may be needed to allow easier programmatic creating/updating/deleting of data elements whilst preserving (or adding or selectively deleting/editing) such formatting. - # - - #within an ARRAY, we store a list of items such as plain dicts (possibly empty) and {type value } for simple types - #(ARRAYS can be mixed type) - #This means our dict structure should have only ARRAY and simple types which need to be in {type value } form - #A dict within an array encodeded as a type ITABLE value should also parse - but is the unpreferred form - REVIEW test? - - #Namespacing? - #ie note the difference: - #[Data] - #temp = { cpu = 79.5, case = 72.0} - # versus - #[Data] - #temps = [{cpu = 79.5, case = 72.0}] - proc to_dict {tomlish} { - package require dictn - - #keep track of which tablenames have already been directly defined, - # so we can raise an error to satisfy the toml rule: 'You cannot define any key or table more than once. Doing so is invalid' - #Note that [a] and then [a.b] is ok if there are no subkey conflicts - so we are only tracking complete tablenames here. - #we don't error out just because a previous tablename segment has already appeared. - - #Declaring, Creating, and Defining Tables - #https://github.com/toml-lang/toml/issues/795 - #(update - only Creating and Defining are relevant terminology) - - #review - #tablenames_info keys type created, defined, createdby, definedby, closedby ??? 
review keys - # [tname] = header_table [[tname]] = header_tablearray - - #consider the following 2 which are legal: - #[table] #'table' created, defined=open type header_table - #x.y = 3 - #[table.x.z] #'table' defined=closed closedby={header_table table.x.z}, 'table.x' created, 'table.x.z' created defined=open definedby={header_table table.x.z} - #k= 22 - # #'table.x.z' defined=closed closedby={eof eof} - - #equivalent datastructure - - #[table] #'table' created, defined=open definedby={header_table table} - #[table.x] #'table' defined=closed closedby={header_table table.x}, 'table.x' created defined=open definedby={header_table table.x} - #y = 3 - #[table.x.z] #'table.x' defined=closed closedby={header_table table.x.z}, 'table.x.z' created defined=open definedby={header_table table.x.z} - #k=22 - - #illegal - #[table] #'table' created and defined=open - #x.y = 3 #'table.x' created first keyval pair defined=open definedby={keyval x.y = 3} - #[table.x.y.z] #'table' defined=closed, 'table.x' closed because parent 'table' closed?, 'table.x.y' cannot be created - #k = 22 - # - ## - we would fail on encountering table.x.y because only table and table.x are effectively tables - but that table.x is closed should be detected (?) - - #illegal - #[table] - #x.y = {p=3} - #[table.x.y.z] - #k = 22 - ## we should fail because y is an inline table which is closed to further entries - - #note: it is not safe to compare normalized tablenames using join! - # e.g a.'b.c'.d is not the same as a.b.c.d - # instead compare {a b.c d} with {a b c d} - # Here is an example where the number of keys is the same, but they must be compared as a list, not a joined string. - #'a.b'.'c.d.e' vs 'a.b.c'.'d.e' - #we need to normalize the tablenames seen so that {"x\ty"} matches {"xy"} - - - - if {[uplevel 1 [list info exists tablenames_info]]} { - upvar tablenames_info tablenames_info - } else { - set tablenames_info [dict create] ;#keyed on tablepath each of which is a list such as {config subgroup etc} (corresponding to config.subgroup.etc) - } - - - log::info "---> to_dict processing '$tomlish'<<<" - set items $tomlish - - foreach lst $items { - if {[lindex $lst 0] ni $::tomlish::tags} { - error "supplied string does not appear to be toml parsed into a tomlish tagged list. Run tomlish::decode::toml on the raw toml data to produce a tomlish list" - } - } - - if {[lindex $tomlish 0] eq "TOMLISH"} { - #ignore TOMLISH tag at beginning - set items [lrange $tomlish 1 end] - } - - set datastructure [dict create] - foreach item $items { - set tag [lindex $item 0] - #puts "...> item:'$item' tag:'$tag'" - switch -exact -- $tag { - KEY - DQKEY - SQKEY { - log::debug "---> to_dict item: processing $tag: $item" - set key [lindex $item 1] - if {$tag eq "DQKEY"} { - set key [::tomlish::utils::unescape_string $key] - } - #!todo - normalize key. (may be quoted/doublequoted) - - if {[dict exists $datastructure $key]} { - error "Duplicate key '$key'. The key already exists at this level in the toml data. The toml data is not valid." - } - - #lassign [_get_keyval_value $item] type val - set keyval_dict [_get_keyval_value $item] - dict set datastructure $key $keyval_dict - } - DOTTEDKEY { - log::debug "---> to_dict item processing $tag: $item" - set dkey_info [tomlish::to_dict::get_dottedkey_info $item] - set dotted_key_hierarchy [dict get $dkey_info keys] - - #a.b.c = 1 - #table_key_hierarchy -> a b - #tleaf -> c - if {[llength $dotted_key_hierarchy] == 0} { - #empty?? probably invalid. 
review - #This is different to '' = 1 or ''.'' = 1 which have lengths 1 and 2 respectively - error "DOTTED key has no parts - invalid? '$item'" - } elseif {[llength $dotted_key_hierarchy] == 1} { - #dottedkey is only a key - no table component - set table_hierarchy [list] - set tleaf [lindex $dotted_key_hierarchy 0] - } else { - set table_hierarchy [lrange $dotted_key_hierarchy 0 end-1] - set tleaf [lindex $dotted_key_hierarchy end] - } - - #ensure empty tables are still represented in the datastructure - #review - this seems unnecessary? - set pathkeys [list] - foreach k $table_hierarchy { - lappend pathkeys $k - if {![dict exists $datastructure {*}$pathkeys]} { - dict set datastructure {*}$pathkeys [list] - } else { - tomlish::log::notice "to_dict datastructure at key path $pathkeys already had data: [dict get $datastructure {*}$pathkeys]" - } - } - #review? - if {[dict exists $datastructure {*}$table_hierarchy $tleaf]} { - error "Duplicate key '$table_hierarchy $tleaf'. The key already exists at this level in the toml data. The toml data is not valid." - } - - #JMN test 2025 - if {[llength $table_hierarchy]} { - dictn incr tablenames_info [list $table_hierarchy seencount] - } - - set keyval_dict [_get_keyval_value $item] - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - set t [list {*}$table_hierarchy $tleaf] - dictn incr tablenames_info [list $t seencount] - dictn set tablenames_info [list $t closed] 1 - - #review - item is an ITABLE - we recurse here without datastructure context :/ - #overwriting keys? todo ? - dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict - } else { - dict set datastructure {*}$table_hierarchy $tleaf $keyval_dict - } - - } - TABLEARRAY { - set dottedtables_defined [list] ;#for closing off at end by setting 'defined' - - set NEST_DICT [dict create] ;#first blush attempt at referencing supertable tablearrays - set tablearrayname [lindex $item 1] - log::debug "---> to_dict processing item TABLENAME (name: $tablearrayname): $item" - set norm_segments [::tomlish::to_dict::tablename_split $tablearrayname true] ;#true to normalize - #we expect repeated tablearray entries - each adding a sub-object to the value, which is an array/list. - #tablearrayname is likely to appear multiple times - so unlike a TABLE we don't check for 'defined' for the full name as an indicator of a problem - set supertable [list] - ############## - # [[a.b.c.d]] - # norm_segments = {a b c d} - #check a {a b} {a b c} <---- supertables of a.b.c.d - ############## - foreach normseg [lrange $norm_segments 0 end-1] { - lappend supertable $normseg - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this path doesn't yet exist - if {[dict exists $datastructure {*}$supertable]} { - #There is data though - so it must have been created as a keyval - set msg "Supertable [join $supertable .] of tablearray name $tablearrayname already has data but doesn't appear to be a table - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablearray_supertable_keycollision - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } else { - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - #REVIEW!! - # what happens with from_toml {[[a.b.c]]} {[[a]]} ??? - dictn set tablenames_info [list $supertable type] header_table ;#how do we know it's not going to be a tablearray? 
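                            #Sketch of the supertable check above (error code as raised in this branch,
                            #test: datastructure_tablearray_supertable_keycollision):
                            #  a = 1
                            #  [[a.b]]   -> error {TOMLISH STRUCTURE KEYCOLLISION} - supertable 'a' already holds keyval data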
- #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } else { - #supertable has already been created - and maybe defined - but even if defined we can add subtables unless it is of type itable - #but if it's a tablearray - we need to point to the most 'recently defined table element of the array' - #(another way of saying last member of that array)?? - set supertype [dictn get $tablenames_info [list $supertable type]] - if {$supertype eq "header_tablearray"} { - puts stdout "todict!!! TABLEARRAY nesting required for supertable [join $supertable .]" - puts stdout "todict!!! todo.." - #how to do multilevel nesting?? - set EXISTING_SUPERARRAY_ELEMENTS [dict get $datastructure {*}$supertable value] - dict set NEST_DICT $supertable $EXISTING_SUPERARRAY_ELEMENTS - puts stdout "todict!!! supertable '[join $supertable .]' elements $EXISTING_SUPERARRAY_ELEMENTS" - } - } - } - # - if {![dictn exists $tablenames_info [list $norm_segments type]]} { - #first encounter of this tablearrayname - if {[dict exists $datastructure {*}$norm_segments]} { - #e.g from_toml {a=1} {[[a]]} - set msg "Cannot create tablearray name $tablearrayname. Key already has data but key doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablearray_direct_keycollision_error - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #no collision - we can create the tablearray and the array in the datastructure - dictn set tablenames_info [list $norm_segments type] header_tablearray - dict set datastructure {*}$norm_segments [list type ARRAY value {}] - set ARRAY_ELEMENTS [list] - } else { - #we have a table - but is it a tablearray? - set ttype [dictn get $tablenames_info [list $norm_segments type]] - #we use a header_unknown type for previous 'created' only tables - - if {$ttype eq "header_unknown"} { - dictn set tablenames_info [list $norm_segments type] header_tablearray - set ttype header_tablearray - #assert - must not be 'defined' - #we have seen it before as a supertable ie 'created' only - #Not 'defined' but could still have subtables - treat it as a dict - set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments] - } else { - if {$ttype ne "header_tablearray"} { - #header_table or itable - switch -- $ttype { - itable {set ttypename itable} - header_table {set ttypename table} - default {error "unrecognised type - expected header_table or itable"} - } - set msg "tablearray name $tablearrayname already appears to be already created as '$ttypename' not tablearray - invalid?" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #EXISTING tablearray - #add to array - #error "add_to_array not implemented" - #{type ARRAY value } - set ARRAY_ELEMENTS [dict get $datastructure {*}$norm_segments value] - } - } - - - set object [dict create] ;#array context equivalent of 'datastructure' - set objectnames_info [dict create] ;#array contex equivalent of tablenames_info - - #add to ARRAY_ELEMENTS and write back in to datastructure. 
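#Sketch of how a tablearray is held in the datastructure built above - an ARRAY-typed
#value whose elements are plain dicts, one per [[...]] header. e.g. for
#  [[fruit]]
#  name = "apple"
#  [[fruit]]
#  name = "pear"
#the entry stored under 'fruit' has roughly this shape:
set fruit_entry {type ARRAY value {{name {type STRING value apple}} {name {type STRING value pear}}}}
foreach f [dict get $fruit_entry value] {
    puts [dict get $f name value]   ;# apple, then pear
}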
- foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> todict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - #MAINTENANCE: temp copy from TABLE - #convert to function: data needed? element, norm_segments refs to: tablename_info datastructure ??? - set dkey_info [tomlish::to_dict::get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - - #[a.b] - #t1.t2.dottedtable.k = "val" - #we have already checked supertables a {a b} - #We need to check {a b t1} & {a b t2} ('creation' only) - #and then 'dottedtable' is 'defined' while k is an ordinary key in dottedtable - - #note we also get here as a 'dottedkey' with a simple - #[a.b] - #k = "val" - - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] - #obsolete - set nonleaf_keys [lrange $all_dotted_keys 0 end-1] ;#may be empty - - if {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] - } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - - #we've already tested the table keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key - #set supertable $norm_segments - set supertable [list] ;#disconnect from main structure - each array element is a new context for key paths! - foreach normkey $dotparents { - lappend supertable $normkey - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this combined path (table norm_segments plus parts of dottedkey) not yet 'created' - if {[dict exists $datastructure {*}$supertable]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dictn set tablenames_info [list $supertable type] unknown_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$norm_segments {*}$dotparents $dottedtable] ;#full path to dottedtable - #our dotted key is attempting to define a table - if {![dictn exists $tablenames_info [list $dottedpath type]]} { - #first one - but check datastructure for collisions - if {[dict exists $datastructure {*}$dottedpath]} { - set msg "Supertable [join $dottedpath .] 
of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dictn set tablenames_info [list $dottedpath type] dottedkey_table - #don't actually set 'defined' here.. use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dictn get $tablenames_info [list $dottedpath type]] eq "dottedkey_table"} { - #right type - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list $dottedpath defined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - } - } - } - #assert - dottedkey represents a key val pair that can be added - - - if {[dict exists $datastructure {*}$norm_segments {*}$all_dotted_keys]} { - set msg "Duplicate key '$norm_segments $all_dotted_keys'. The key already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - - set keyval_dict [_get_keyval_value $element] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> TABLE/dottedkey '$dottedkeyname' kv: $keyval_dict" - #dict set datastructure {*}$norm_segments {*}$all_dotted_keys $keyval_dict - #wrong - #TODO!!!!!!!!!!!!! - #lappend ARRAY_ELEMENTS [list $dottedkeyname $keyval_dict] - dict set object $dottedkeyname $keyval_dict - - #remove ? - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - set tkey [list {*}$norm_segments {*}$all_dotted_keys] - #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] - - #by not creating a tablenames_info record - we effectively make it closed anyway? - #it should be detected as a key - #is there any need to store tablenames_info for it?? - #REVIEW - - ##TODO - update? - #dictn incr tablenames_info [list $tkey seencount] - ##if the keyval_dict is not a simple type x value y - then it's an inline table ? - ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - #dictn set tablenames_info [list $tkey closed] 1 - } - - } - NEWLINE - COMMENT - WS { - #ignore - } - TABLE { - #we should be able to process tablearray subtables either as part of the tablearray record, or independently. 
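#Sketch of how a caller can test for the structured errorcodes raised above
#(assumes ::tomlish::from_toml / ::tomlish::to_dict as used by toml_to_json further below):
set toml "a = 1\n\[\[a\]\]\n"
if {[catch {::tomlish::to_dict [::tomlish::from_toml $toml]} err opts]} {
    puts [dict get $opts -errorcode]   ;# TOMLISH STRUCTURE KEYCOLLISION
}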
- #(or even a mixture of both, although that is somewhat an edge case) - #[[fruit]] - #x=1 - # [fruit.metadata] - # [fruit.otherdata] - - #when processing a dict destined for the above - the tomlish generator (e.g from_dict) - #should create as 1 or 3 records (but could create 2 records if there was an unrelated table in between the subtables) - #choices: all in tablearray record, tablearray + 1 or 2 table records. - # - #We are going the other way here - so we just need to realize the list of tables 'belonging' to this tablearray might not be complete. - # - #the subtable names must be prefixed with the tablearray - we should validate that for any contained TABLE records - - } - default { - error "Sub element of type '$type' not understood in tablearray context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" - } - } - } - - #todo? - ##end of TABLE record - equivalent of EOF or next header - close off the dottedtables - #foreach dtablepath $dottedtables_defined { - # dictn set tablename_info [list $dtablepath defined] closed - #} - - if {[dict size $NEST_DICT]} { - puts "reintegrate?? $NEST_DICT" - #todo - more - what if multiple in hierarchy? - dict for {superpath existing_elements} $NEST_DICT { - #objects stored directly as dicts in ARRAY value - set lastd [lindex $existing_elements end] - #insufficient.. - #dict set lastd [lindex $norm_segments end] [list type ITABLE value $object] - dict set lastd [lindex $norm_segments end] $object - #set lastd [dict merge $lastd $object] - lset existing_elements end $lastd - dict set datastructure {*}$superpath [list type ARRAY value $existing_elements] - } - } else { - #lappend ARRAY_ELEMENTS [list type ITABLE value $object] - lappend ARRAY_ELEMENTS $object - dict set datastructure {*}$norm_segments [list type ARRAY value $ARRAY_ELEMENTS] - } - } - TABLE { - set tablename [lindex $item 1] - set dottedtables_defined [list] ;#for closing off at end by setting 'defined' - #As our TABLE record contains all it's child DOTTEDKEY records - this should be equivalent to setting them as defined at EOF or next header. - - log::debug "---> to_dict processing item TABLE (name: $tablename): $item" - set norm_segments [::tomlish::to_dict::tablename_split $tablename true] ;#true to normalize - - set T_DEFINED [dictn getdef $tablenames_info [list $norm_segments defined] NULL] - if {$T_DEFINED ni [list NULL header_tablearray]} { - #our tablename e.g [a.b.c.d] declares a space to 'define' subkeys - but there has already been a definition space for this path - set msg "Table name $tablename has already been directly defined in the toml data. Invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE TABLEREDEFINED} $msg - } - - - set name_segments [::tomlish::to_dict::tablename_split $tablename 0] ;#unnormalized e.g ['a'."b".c.d] -> 'a' "b" c d - #results of tablename_split 0 are 'raw' - ie some segments may be enclosed in single or double quotes. 
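#Sketch of the created-vs-defined distinction handled above:
#  [a.b]    ;# 'a' is only created (header_unknown), 'a.b' is defined
#  [a]      ;# legal - 'a' is defined here for the first time
#whereas repeating a header triggers the TABLEREDEFINED error raised above
#(assumes the same from_toml/to_dict entry points):
if {[catch {::tomlish::to_dict [::tomlish::from_toml "\[a\]\n\[a\]\n"]} err opts]} {
    puts [dict get $opts -errorcode]   ;# TOMLISH STRUCTURE TABLEREDEFINED
}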
- - - set supertable [list] - ############## - # [a.b.c.d] - # norm_segments = {a b c d} - #check a {a b} {a b c} <---- supertables of a.b.c.d - ############## - foreach normseg [lrange $norm_segments 0 end-1] { - lappend supertable $normseg - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this path doesn't yet exist - if {[dict exists $datastructure {*}$supertable]} { - #There is data though - so it must have been created as a keyval - set msg "Supertable [join $supertable .] of table name $tablename already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - #we also don't know whether it's a table or a tablearray - dictn set tablenames_info [list $supertable type] header_unknown - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } else { - #supertable has already been created - and may be defined - but even if defined we can add subtables unless it is of type itable - - } - } - #table [a.b.c.d] hasn't been defined - but may have been 'created' already by a longer tablename - # - or may have existing data from a keyval - if {![dictn exists $tablenames_info [list $norm_segments type]]} { - if {[dict exists $datastructure {*}$norm_segments]} { - #e.g from_toml {a=1} {[a]} - set msg "Cannot create table name $tablename. Key already has data but key doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - #test: datastructure_tablename_keyval_collision_error - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #no data or previously created table - dictn set tablenames_info [list $norm_segments type] header_table - - #We are 'defining' this table's keys and values here (even if empty) - dict set datastructure {*}$norm_segments [list] ;#ensure table still represented in datastructure even if we add no keyvals here - } - dictn set tablenames_info [list $norm_segments defined] open - log::debug ">>> to_dict >>>>>>>>>>>>>>>>> normalized table key hierarchy : $norm_segments" - - #now add the contained elements - foreach element [lrange $item 2 end] { - set type [lindex $element 0] - log::debug "----> todict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - #convert to function: data needed? element, norm_segments refs to: tablename_info datastructure ??? - set dkey_info [tomlish::to_dict::get_dottedkey_info $element] - #e.g1 keys {x.y y} keys_raw {'x.y' "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - #e.g2 keys {x.y y} keys_raw {{"x.y"} "a\tb"} keys {x.y {a b}} (keys_raw has escape, keys has literal tab char) - - #[a.b] - #t1.t2.dottedtable.k = "val" - #we have already checked supertables a {a b} - #We need to check {a b t1} & {a b t2} ('creation' only) - #and then 'dottedtable' is 'defined' while k is an ordinary key in dottedtable - - #note we also get here as a 'dottedkey' with a simple - #[a.b] - #k = "val" - - set all_dotted_keys [dict get $dkey_info keys] - set dottedkeyname [join $all_dotted_keys .] 
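#Sketch (plain Tcl only) of where a dotted key inside a table header ends up, e.g. for
#  [a]
#  b.c = 1
#norm_segments is {a}, the dotted keys are {b c}, and the typed value is stored
#at the combined path:
set datastructure [dict create]
dict set datastructure a b c {type INT value 1}
puts $datastructure   ;# a {b {c {type INT value 1}}}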
- #obsolete - set nonleaf_keys [lrange $all_dotted_keys 0 end-1] ;#may be empty - - if {[llength $all_dotted_keys] > 1} { - #dottedtable.k=1 - #tX.dottedtable.k=1 - #etc - - set defines_a_table 1 - #Wrap in a list so we can detect 'null' equivalent. - #We can't use empty string as that's a valid dotted key segment - set dottedtable_bag [list [lindex $all_dotted_keys end-1]] - set dotparents [lrange $all_dotted_keys 0 end-2] - } else { - #basic case - not really a 'dotted' key - #a = 1 - set defines_a_table 0 - set dottedtable_bag [list] ;#empty bag - set dotparents [list] - } - #assert dottedtable_bag only ever holds 0 or 1 elements - set leaf_key [lindex $all_dotted_keys end] - - #we've already tested the table keys that got us here.. but not the dottedkey segments (if any) prior to dottedtable & leaf_key - set supertable $norm_segments - foreach normkey $dotparents { - lappend supertable $normkey - if {![dictn exists $tablenames_info [list $supertable type]]} { - #supertable with this combined path (table norm_segments plus parts of dottedkey) not yet 'created' - if {[dict exists $datastructure {*}$supertable]} { - #There is data so it must have been created as a keyval - set msg "Supertable [join $supertable .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #here we 'create' it, but it's not being 'defined' ie we're not setting keyvals for it here - dictn set tablenames_info [list $supertable type] unknown_table ;#REVIEW - #ensure empty tables are still represented in the datastructure - dict set datastructure {*}$supertable [list] - } - } - if {[llength $dottedtable_bag] == 1} { - set dottedtable [lindex $dottedtable_bag 0] - set dottedpath [list {*}$norm_segments {*}$dotparents $dottedtable] ;#full path to dottedtable - #our dotted key is attempting to define a table - if {![dictn exists $tablenames_info [list $dottedpath type]]} { - #first one - but check datastructure for collisions - if {[dict exists $datastructure {*}$dottedpath]} { - set msg "Supertable [join $dottedpath .] of dotted key $dottedkeyname already has data but doesn't appear to be a table (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - #raise a specific type of error for tests to check - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - #'create' the table - dictn set tablenames_info [list $dottedpath type] dottedkey_table - #don't actually set 'defined' here.. 
use the end of TABLE record to close them off by looking at this list - lappend dottedtables_defined $dottedpath - # - } else { - #exists - but might be from another dottedkey within the current header section - #the table is open for adding keys until the next 'header' section ([tablename] / [[tablearray]]) - #check for 'defined' closed (or just existence) - if {[dictn get $tablenames_info [list $dottedpath type]] eq "dottedkey_table"} { - #right type - but make sure it's from this header section - i.e defined not set - set definedstate [dictn getdef $tablenames_info [list $dottedpath defined] NULL] - if {$definedstate ne "NULL"} { - #collision with some other dottedkey - set msg "Table $dottedpath represented by dottedkey $dottedkeyname has been 'defined' elsewhere (keycollision) - invalid" - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - } - } - } - #assert - dottedkey represents a key val pair that can be added - - - if {[dict exists $datastructure {*}$norm_segments {*}$all_dotted_keys]} { - set msg "Duplicate key '$norm_segments $all_dotted_keys'. The key already exists at this level in the toml data. The toml data is not valid." - append msg \n [tomlish::dict::_show_tablenames $tablenames_info] - return -code error -errorcode {TOMLISH STRUCTURE KEYCOLLISION} $msg - } - - set keyval_dict [_get_keyval_value $element] - #keyval_dict is either a {type value } - #or the result from parsing an arbitrary dict from an inline table - which could theoretically look the same at the topmost level - #punk::dict::is_tomlish_typeval can distinguish - puts stdout "to_dict>>> TABLE/dottedkey $dottedkeyname kv: $keyval_dict" - dict set datastructure {*}$norm_segments {*}$all_dotted_keys $keyval_dict - - #remove ? - if {![tomlish::dict::is_tomlish_typeval $keyval_dict]} { - #the value is either empty or or a dict structure with arbitrary (from-user-data) toplevel keys - # inner structure will contain {type value } if all leaves are not empty ITABLES - set tkey [list {*}$norm_segments {*}$all_dotted_keys] - #lappend tablenames_info [list {*}$norm_segments {*}$dkeys $leaf_key] - - #by not creating a tablenames_info record - we effectively make it closed anyway? - #it should be detected as a key - #is there any need to store tablenames_info for it?? - #REVIEW - - ##TODO - update? - #dictn incr tablenames_info [list $tkey seencount] - ##if the keyval_dict is not a simple type x value y - then it's an inline table ? - ##if so - we should add the path to the leaf_key as a closed table too - as it's not allowed to have more entries added. - #dictn set tablenames_info [list $tkey closed] 1 - } - - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in table context. Expected only DOTTEDKEY,NEWLINE,COMMENT,WS" - } - } - } - - #end of TABLE record - equivalent of EOF or next header - close off the dottedtables - foreach dtablepath $dottedtables_defined { - dictn set tablename_info [list $dtablepath defined] closed - } - - - #review??? - #now make sure we add an empty value if there were no contained elements! - #!todo. - } - ITABLE { - #SEP??? 
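#Sketch of the shape distinction relied on above (::tomlish::dict::is_tomlish_typeval
#as called throughout this proc): a typed leaf vs an inline-table result keyed by
#its member names.
puts [::tomlish::dict::is_tomlish_typeval {type INT value 3}]       ;# true  - a typed leaf
puts [::tomlish::dict::is_tomlish_typeval {p {type INT value 3}}]   ;# false - dict keyed by member names (from an inline table)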
- set datastructure [list] - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - DOTTEDKEY { - set dkey_info [tomlish::to_dict::get_dottedkey_info $element] - set dotted_key_hierarchy [dict get $dkey_info keys] - set leaf_key [lindex $dotted_key_hierarchy end] - set dkeys [lrange $dotted_key_hierarchy 0 end-1] - - #ensure empty keys are still represented in the datastructure - set table_keys [list] ;#We don't know the context - next level up will have to check for key collisions? - set test_keys $table_keys - foreach k $dkeys { - lappend test_keys $k - if {![dict exists $datastructure {*}$test_keys]} { - dict set datastructure {*}$test_keys [list] - } else { - tomlish::log::notice "to_dict datastructure at (DOTTEDKEY) subkey $test_keys already had data: [dict get $datastructure {*}$test_keys]" - } - } - - if {[dict exists $datastructure {*}$table_keys {*}$dkeys $leaf_key]} { - error "Duplicate key '$table_keys $dkeys $leaf_key'. The key already exists at this level in the toml data. The toml data is not valid." - } - set keyval_dict [_get_keyval_value $element] - dict set datastructure {*}$table_keys {*}$dkeys $leaf_key $keyval_dict - } - NEWLINE - COMMENT - WS { - #ignore - } - default { - error "Sub element of type '$type' not understood in ITABLE context. Expected only KEY,DQKEY,SQKEY,NEWLINE,COMMENT,WS" - } - } - } - } - ARRAY { - #arrays in toml are allowed to contain mixtures of types - set datastructure [list] - log::debug "--> processing array: $item" - - foreach element [lrange $item 1 end] { - set type [lindex $element 0] - log::debug "----> tododict processing $tag subitem $type processing contained element $element" - switch -exact -- $type { - INT - FLOAT - BOOL - DATETIME { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - STRING { - set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::utils::unescape_string $value]] - } - LITERAL { - set value [lindex $element 1] - lappend datastructure [list type $type value $value] - } - ITABLE { - #anonymous table - #lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - lappend datastructure [::tomlish::to_dict [list $element]] ;#store itables within arrays as raw dicts (possibly empty) - } - TABLE { - #invalid? shouldn't be output from from_dict - but could manually be constructed as such? review - #doesn't make sense as table needs a name? - #take as synonym for ITABLE? - error "to_dict TABLE within array unexpected" - } - ARRAY - MULTISTRING - MULTILITERAL { - #set value [lindex $element 1] - lappend datastructure [list type $type value [::tomlish::to_dict [list $element]]] - } - WS - SEP - NEWLINE - COMMENT { - #ignore whitespace, commas, newlines and comments - } - default { - error "Unexpected value type '$type' found in array" - } - } - } - } - MULTILITERAL { - #triple squoted string - #first newline stripped only if it is the very first element - #(ie *immediately* following the opening delims) - #All whitespace other than newlines is within LITERALPARTS - # ------------------------------------------------------------------------- - #todo - consider extension to toml to allow indent-aware multiline literals - # how - propose as issue in toml github? Use different delim? e.g ^^^ ? 
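#Sketch of the first-newline rule noted above (assumes ::tomlish::from_toml /
#::tomlish::to_dict as used elsewhere in this module):
#  s = '''
#  line1
#  '''
#keeps the value "line1\n" - only the newline immediately after the opening
#delimiter is dropped, and that stripping happens here rather than in the parser.
set toml "s = '''\nline1\n'''\n"
set d [::tomlish::to_dict [::tomlish::from_toml $toml]]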
- #e.g - # xxx=?'''abc - # def - # etc - # ''' - # - we would like to trimleft each line to the column following the opening delim - # ------------------------------------------------------------------------- - - log::debug "---> todict processing multiliteral: $item" - set parts [lrange $item 1 end] - if {[lindex $parts 0 0] eq "NEWLINE"} { - set parts [lrange $parts 1 end] ;#skip it - } - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - switch -exact -- $type { - LITERALPART { - append stringvalue [lindex $element 1] - } - NEWLINE { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - MULTISTRING { - #triple dquoted string - log::debug "---> to_dict processing multistring: $item" - set stringvalue "" - set idx 0 - set parts [lrange $item 1 end] - for {set idx 0} {$idx < [llength $parts]} {incr idx} { - set element [lindex $parts $idx] - set type [lindex $element 0] - #We use STRINGPART in the tomlish representation as a distinct element to STRING - which would imply wrapping quotes to be reinserted - switch -exact -- $type { - STRING { - #todo - do away with STRING ? - #we don't build MULTISTRINGS containing STRING - but should we accept it? - tomlish::log::warn "double quoting a STRING found in MULTISTRING - should be STRINGPART?" - append stringvalue "\"[::tomlish::utils::unescape_string [lindex $element 1]]\"" - } - STRINGPART { - append stringvalue [::tomlish::utils::unescape_string [lindex $element 1]] - } - CONT { - #When the last non-whitespace character on a line is an unescaped backslash, - #it will be trimmed along with all whitespace (including newlines) up to the next non-whitespace character or closing delimiter - # review - we allow some whitespace in stringpart elements - can a stringpart ever be all whitespace? 
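#Sketch of the line-ending backslash behaviour quoted above (same assumed entry points): in
#  s = """fox \
#         jumps"""
#the backslash and all following whitespace/newlines are trimmed, giving "fox jumps".
set toml "s = \"\"\"fox \\\n       jumps\"\"\"\n"
set d [::tomlish::to_dict [::tomlish::from_toml $toml]]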
- set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - append stringvalue "\\" ;#add the sep - } else { - #skip over ws without emitting - set idx [llength $parts] - } - } else { - set parts_til_nl [lrange $parts 0 $next_nl-1] - set non_ws [lsearch -index 0 -start $idx+1 -not $parts_til_nl WS] - if {$non_ws >= 0} { - append stringvalue "\\" - } else { - #skip over ws on this line - set idx $next_nl - #then have to check each subsequent line until we get to first non-whitespace - set trimming 1 - while {$trimming && $idx < [llength $parts]} { - set next_nl [lsearch -index 0 -start $idx+1 $parts NEWLINE] - if {$next_nl == -1} { - #last line - set non_ws [lsearch -index 0 -start $idx+1 -not $parts WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - } else { - set idx [llength $parts] - } - set trimming 0 - } else { - set non_ws [lsearch -index 0 -start $idx+1 -not [lrange $parts 0 $next_nl-1] WS] - if {$non_ws >= 0} { - set idx [expr {$non_ws -1}] - set trimming 0 - } else { - set idx $next_nl - #keep trimming - } - } - } - } - } - } - NEWLINE { - #if newline is first element - it is not part of the data of a multistring - if {$idx > 0} { - set val [lindex $element 1] - if {$val eq "nl"} { - append stringvalue \n - } else { - append stringvalue \r\n - } - } - } - WS { - append stringvalue [lindex $element 1] - } - default { - error "Unexpected value type '$type' found in multistring" - } - } - } - set datastructure $stringvalue - } - WS - COMMENT - NEWLINE { - #ignore - } - default { - error "Unexpected tag '$tag' in Tomlish list '$tomlish'" - } - } - } - return $datastructure - } - - - proc _from_dictval_tomltype {parents tablestack keys typeval} { - set type [dict get $typeval type] - set val [dict get $typeval value] - switch -- $type { - ARRAY { - set subitems [list] - foreach item $val { - lappend subitems [_from_dictval [list {*}$parents ARRAY] $tablestack $keys $item] SEP - } - if {[lindex $subitems end] eq "SEP"} { - set subitems [lrange $subitems 0 end-1] - } - return [list ARRAY {*}$subitems] - } - ITABLE { - if {$val eq ""} { - return ITABLE - } else { - return [_from_dictval [list {*}$parents ITABLE] $tablestack $keys $val] - } - } - MULTISTRING { - #value is a raw string that isn't encoded as tomlish - #create a valid toml snippet with the raw value and decode it to the proper tomlish MULTISTRING format - #We need to convert controls in $val to escape sequences - except for newlines - # - #consider an *option* to reformat for long lines? (perhaps overcomplex - byte equiv - but may fold in ugly places) - #we could use a line-length limit to decide when to put in a "line ending backslash" - #and even format it with a reasonable indent so that proper CONT and WS entries are made (?) 
REVIEW - # - #TODO - set tomlpart "x=\"\"\"\\\n" - append tomlpart $val "\"\"\"" - set tomlish [tomlish::decode::toml $tomlpart] - #e.g if val = " etc\nblah" - #TOMLISH {DOTTEDKEY {{KEY x}} = {MULTISTRING CONT {NEWLINE LF} {WS { }} {STRINGPART etc} {NEWLINE lf} {STRINGPART blah} } } - #lindex 1 3 is the MULTISTRING tomlish list - return [lindex $tomlish 1 3] - } - MULTILITERAL { - #MLL string can contain newlines - but still no control chars - #todo - validate - set tomlpart "x='''\n" - append tomlpart $val ''' - set tomlish [tomlish::decode::toml $tomlpart] - return [lindex $tomlish 1 3] - } - LITERAL { - #from v1.0 spec - "Control characters other than tab are not permitted in a literal string" - #(This rules out raw ANSI SGR - which is somewhat restrictive - but perhaps justified for a config format - # as copy-pasting ansi to a config value is probably not always wise, and it's not something that can be - # easily input via a text editor. ANSI can go in Basic strings using the \e escape if that's accepted v1.1?) - #we could choose to change the type to another format here when encountering invalid chars - but that seems - #like too much magic. We elect to error out and require the dict to have valid data for the types it specifies. - if {[string first ' $val] >=0} { - error "_from_dictval_tomltype error: single quote found in LITERAL - cannot encode dict to TOML-VALID TOMLISH" - } - #detect control chars other than tab - #for this we can use rawstring_to_Bstring_with_escaped_controls - even though this isn't a Bstring - #we are just using the map to detect a difference. - set testval [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val] - if {$testval ne $val} { - #some escaping would have to be done if this value was destined for a Bstring... - #therefor this string has controls and isn't suitable for a LITERAL according to the specs. - error "_from_dictval_tomltype error: control chars (other than tab) found in LITERAL value - cannot encode dict to TOML-VALID TOMLISH" - } - return [list LITERAL $val] - } - STRING { - return [list STRING [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $val]] - } - INT { - if {![::tomlish::utils::is_int $val]} { - error "_from_dictval_tomltype error: bad INT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list INT $val] - } - FLOAT { - if {![::tomlish::utils::is_float $val]} { - error "_from_dictval_tomltype error: bad FLOAT value '$val' - cannot encode dict to TOML-VALID TOMLISH" - } - return [list FLOAT $val] - } - default { - if {$type ni [::tomlish::tags]} { - error "_from_dictval_tomltype error: Unrecognised typename '$type' in {type value } - cannot encode dict to TOML-VALID TOMLISH" - } - return [list $type $val] - } - } - } - - #take a raw string and classify: result is a 2 element list comprised of KEY|SQKEY|DQKEY and the value being the appropriate inner string - proc _from_dict_classify_key {rawval} { - if {![::tomlish::utils::is_barekey $rawval]} { - #requires quoting - # - #Any dot in the key would have been split by to_dict - so if it's present here it's part of this key - not a level separator! - # - #we'll use a basic mechanisms for now to determine the type of quoting - # - whether it has any single quotes or not. - # (can't go in an SQKEY) - # - whether it has any chars that require quoting when in a Bstring - # (if so - then its visual representation might be unsuitable for a key in a toml text file, so escape and put in DQKEY instead of literal SQKEY) - #todo - more? 
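#Usage sketch of the resulting classification (namespace-qualified calls assumed,
#since this is an internal helper; the tab example mirrors the keys_raw examples above):
#  ::tomlish::_from_dict_classify_key abc     -> KEY abc      (bare key)
#  ::tomlish::_from_dict_classify_key a.b     -> SQKEY a.b    (needs quoting, literal form ok)
#  ::tomlish::_from_dict_classify_key it's    -> DQKEY it's   (contains a single quote)
#  a key holding a literal tab                -> DQKEY a\tb   (control char escaped into basic-string form)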
- #REVIEW - the backslash might often be in things like a regex or windows path - which is often better expressed in a literal SQKEY - # from literal examples: - # 'c:\Users\nodejs\templates' - # '<\i\c*\s*>' - #If these are in *keys* our basic test will express these as: - # "c:\\Users\\nodejs\\templates" - # "<\\i\\c*\\s*>" - # This still works - but a smarter test might determine when SQKEY is the better form? - #when coming from external systems - can we even know if the value was already escaped? REVIEW - #Probably when coming from json - we know it's already escaped - and so we build our dict converting keys to unescaped - #TODO - clarify in documentation that keys resulting from to_dict are in 'normalized' (unescaped) form - # - #For keys - we currently (2025) are only allowed barekeys,basic strings and literal strings. (no multiline forms) - set k_escaped [tomlish::utils::rawstring_to_Bstring_with_escaped_controls $rawval] - if {[string length $k_escaped] != [string length $rawval]} { - #escaping made a difference - set has_escape_requirement 1 - } else { - set has_escape_requirement 0 - } - if {[string first ' $rawval] >=0 || $has_escape_requirement} { - #basic string - # (any ANSI SGR sequence will end up here in escaped form ) - return [list DQKEY $k_escaped] - } else { - #literal string - return [list SQKEY $rawval] - } - } else { - return [list KEY $rawval] - } - } - - #the quoting implies the necessary escaping for DQKEYs - proc _from_dict_join_and_quote_raw_keys {rawkeylist} { - set result "" - foreach rk $rawkeylist { - lassign [_from_dict_classify_key $rk] type val - switch -- $type { - SQKEY { - append result "'$val'." - } - DQKEY { - append result "\"$val\"." - } - KEY { - append result "$val." - } - } - } - return [string range $result 0 end-1] - } - proc _from_dictval {parents tablestack keys vinfo} { - set k [lindex $keys end] - set K_PART [_from_dict_classify_key $k] ;#get [list SQKEY ] - puts stderr "---parents:'$parents' keys:'$keys' vinfo: $vinfo---" - puts stderr "---tablestack: $tablestack---" - set result [list] - set lastparent [lindex $parents end] - if {$lastparent in [list "" do_inline]} { - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - set type [dict get $vinfo type] - #treat ITABLE differently? - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result DOTTEDKEY [list $K_PART {WS { }}] = {WS { }} $sublist {NEWLINE lf} - } else { - if {$vinfo ne ""} { - - #set result [list DOTTEDKEY [list [list KEY $k]] = ] - #set records [list ITABLE] - - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - - if {$lastparent eq "do_inline"} { - set result [list DOTTEDKEY [list $K_PART] =] - set records [list ITABLE] - } else { - set tname [_from_dict_join_and_quote_raw_keys [list $k]] - set result [list TABLE $tname {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $k]] - set records [list] - } - - - - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - #(SQKEY & DQKEY do not have the enclosing quotes in their returned val) - #if {[regexp {\s} $vk] || [string first . 
$vk] >= 0} { - # set VK_PART [list SQKEY $vk] - #} else { - # set VK_PART [list KEY $vk] - #} - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - #REVIEW - we could detect if value is an array of objects, - #and depending on parent context - emit a series of TABLEARRAY records instead of a DOTTEDKEY record containing an ARRAY of objects - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART {WS { }}] = {WS { }} $sublist] - } else { - if {$vv eq ""} { - #experimental - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline KEY $vk (tablestack:$tablestack)" - #set tname [tomlish::dict::name_from_tablestack [list {*}$tablestack [list T $vk]]] - - #we can't just join normalized keys - need keys with appropriate quotes and escapes - #set tname [join [list {*}$keys $vk] .] ;#WRONG - set tq [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - set record [list TABLE $tq {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - } else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - set tablestack [list {*}$tablestack [list I $vk]] - } - } else { - if { 0 } { - #experiment.. sort of getting there. - if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - puts stderr "_from_dictval could uninline2 KEYS [list {*}$keys $vk] (tablestack:$tablestack)" - set tq [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - set record [list TABLE $tq {NEWLINE lf}] - set tablestack [list {*}$tablestack [list T $vk]] - - #review - todo? - set dottedkey_value [_from_dictval [list {*}$parents TABLE] $tablestack [list {*}$keys $vk] $vv] - lappend record {*}$dottedkey_value - - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } else { - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - } - if {$dictidx != $lastidx} { - #lappend record SEP - if {$lastparent eq "do_inline"} { - lappend record SEP - } else { - lappend record {NEWLINE lf} - } - } - lappend records $record - incr dictidx - } - if {$lastparent eq "do_inline"} { - lappend result $records {NEWLINE lf} - } else { - lappend result {*}$records {NEWLINE lf} - } - } else { - if {$lastparent eq "do_inline"} { - lappend result DOTTEDKEY [list $K_PART] = ITABLE {NEWLINE lf} - } else { - set tname [_from_dict_join_and_quote_raw_keys [list $k]] - lappend result TABLE $tname {NEWLINE lf} - } - } - } - } else { - #lastparent is not toplevel "" or "do_inline" - if {[tomlish::dict::is_tomlish_typeval $vinfo]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vinfo] - lappend result {*}$sublist - } else { - if {$lastparent eq "TABLE"} { - #review - dict for {vk vv} $vinfo { - set VK_PART [_from_dict_classify_key $vk] ;#get [list SQKEY ] - set dottedkey_value [_from_dictval [list {*}$parents DOTTEDKEY] $tablestack [list {*}$keys $vk] $vv] - lappend result [list DOTTEDKEY [list $VK_PART] = $dottedkey_value {NEWLINE lf}] - } - } else { - if {$vinfo ne ""} { - set lastidx [expr {[dict size $vinfo] -1}] - set dictidx 0 - set sub [list] - #REVIEW - #set result $lastparent ;#e.g sets ITABLE - set result ITABLE - set last_tomltype_posn [tomlish::dict::last_tomltype_posn $vinfo] - dict for {vk vv} $vinfo { - set VK_PART 
[_from_dict_classify_key $vk] ;#get [list SQKEY ] - if {[tomlish::dict::is_tomlish_typeval $vv]} { - #type x value y - set sublist [_from_dictval_tomltype $parents $tablestack $keys $vv] - set record [list DOTTEDKEY [list $VK_PART] = $sublist] - } else { - if {$vv eq ""} { - #can't just uninline at this level - #we need a better method to query main dict for uninlinability at each level - # (including what's been inlined already) - #if {[lindex $parents 0] eq "" && $dictidx > $last_tomltype_posn} { - # puts stderr "_from_dictval uninline2 KEY $keys" - # set tname [_from_dict_join_and_quote_raw_keys [list {*}$keys $vk]] - # set record [list TABLE $tname {NEWLINE lf}] - # set tablestack [list {*}$tablestack [list T $vk]] - #} else { - set record [list DOTTEDKEY [list $VK_PART] = ITABLE] - #} - } else { - #set sub [_from_dictval ITABLE $vk $vv] - set dottedkey_value [_from_dictval [list {*}$parents ITABLE] $tablestack [list {*}$keys $vk] $vv] - #set record [list DOTTEDKEY [list $VK_PART] = ITABLE $dottedkey_value] - set record [list DOTTEDKEY [list $VK_PART] = $dottedkey_value] - } - } - if {$dictidx != $lastidx} { - lappend record SEP - } - lappend result $record - incr dictidx - } - } else { - puts stderr "table x-1" - lappend result DOTTEDKEY [list $K_PART] = ITABLE - } - } - } - } - return $result - } - - - proc from_dict {d} { - #consider: - # t1={a=1,b=2} - # x = 1 - #If we represent t1 as an expanded table we get - # [t1] - # a=1 - # b=2 - # x=1 - # --- which is incorrect - as x was a toplevel key like t1! - #This issue doesn't occur if x is itself an inline table - # t1={a=1,b=2} - # x= {no="problem"} - # - # (or if we were to reorder x to come before t1) - - #As the dictionary form doesn't distinguish the structure used to create tables {[table1]\nk=v} vs inline {table1={k=v}} - #Without a solution, from_dict would have to always produce the inline form for toplevel tables unless we allowed re-ordering, - #which is unpreferred here. - - #A possible solution: - #scan the top level to see if all (trailing) elements are themselves dicts - # (ie not of form {type XXX value yyy}) - # - # A further point is that if all root level values are at the 'top' - we can treat lower table-like structures as {[table]} elements - #ie we don't need to force do_inline if all the 'simple' keys are before any compound keys - - #set root_has_values 0 - #approach 1) - the naive approach - forces inline when not always necessary - #dict for {k v} $d { - # if {[llength $v] == 4 && [lindex $v 0] eq "type"} { - # set root_has_values 1 - # break - # } - #} - - - #approach 2) - track the position of last {type x value y} in the dictionary built by to_dict - # - still not perfect. Inlines dotted tables unnecessarily - #This means from_dict doesn't produce output optimal for human editing. - set last_simple [tomlish::dict::last_tomltype_posn $d] - - - ## set parent "do_inline" ;#a value used in _from_dictval to distinguish from "" or other context based parent values - #Any keys that are themselves tables - will need to be represented inline - #to avoid reordering, or incorrect assignment of plain values to the wrong table. - - ## set parent "" - #all toplevel keys in the dict structure can represent subtables. - #we are free to use {[tablename]\n} syntax for toplevel elements. 
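#usage sketch (leaf values are typed dicts as produced by to_dict; re-encoded via
#the to_toml alias defined further below):
#  set d [dict create t1 [dict create a {type INT value 1} b {type INT value 2}] x {type INT value 3}]
#  ::tomlish::to_toml [::tomlish::from_dict $d]
#  -> t1 is emitted as an inline table, followed by x = 3, so x remains a toplevel key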
- - - set tomlish [list TOMLISH] - set dictposn 0 - set tablestack [list [list T root]] ;#todo - dict for {t tinfo} $d { - if {$last_simple > $dictposn} { - set parents [list do_inline] - } else { - set parents [list ""] - } - set keys [list $t] - #review - where to make decision on - # DOTTEDKEY containing array of objs - #vs - # list of TABLEARRAY records - #At least for the top - set trecord [_from_dictval $parents $tablestack $keys $tinfo] - lappend tomlish $trecord - incr dictposn - } - return $tomlish - } - - proc json_to_toml {json} { - #*** !doctools - #[call [fun json_to_toml] [arg json]] - #[para] - - set tomlish [::tomlish::from_json $json] - set toml [::tomlish::to_toml $tomlish] - } - - #TODO use huddle? - proc from_json {json} { - #set jstruct [::tomlish::json_struct $json] - #return [::tomlish::from_json_struct $jstruct] - package require huddle - package require huddle::json - set h [huddle::json::json2huddle parse $json] - - } - - proc from_json_struct {jstruct} { - package require fish::json_toml - return [fish::json_toml::jsonstruct2tomlish $jstruct] - } - - proc toml_to_json {toml} { - set tomlish [::tomlish::from_toml $toml] - return [::tomlish::get_json $tomlish] - } - - proc get_json {tomlish} { - package require fish::json - set d [::tomlish::to_dict $tomlish] - - #return [::tomlish::dict_to_json $d] - return [fish::json::from "struct" $d] - } - - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -namespace eval tomlish::build { - #STRING,INT,FLOAT,BOOL, DATETIME - simple wrappers for completeness - # take a value of the appropriate type and wrap as a tomlish tagged item - proc STRING {s} { - return [list STRING [::tomlish::utils::rawstring_to_Bstring_with_escaped_controls $s]] - } - proc LITERAL {litstring} { - error todo - } - - proc INT {i} { - #whole numbers, may be prefixed with a + or - - #Leading zeros are not allowed - #Hex,octal binary forms are allowed (toml 1.0) - #We will error out on encountering commas, as commas are interpreted differently depending on locale (and don't seem to be supported in the toml spec anyway) - #!todo - Tcl can handle bignums - bigger than a 64bit signed long as specified in toml. - # - We should probably raise an error for number larger than this and suggest the user supply it as a string? - if {[tcl::string::last , $i] > -1} { - error "Unable to interpret '$i' as an integer. Use underscores if you need a thousands separator [::tomlish::parse::report_line]" - } - if {![::tomlish::utils::int_validchars $i]} { - error "Unable to interpret '$i' as an integer. Only 0-9 + 1 _ characters are acceptable. [::tomlish::parse::report_line]" - } - - if {[::tomlish::utils::is_int $i]} { - return [list INT $i] - } else { - error "'$i' is not a valid integer as per the Toml spec. [::tomlish::parse::report_line]" - } - - } - - proc FLOAT {f} { - #convert any non-lower case variants of special values to lowercase for Toml - if {[::tcl::string::tolower $f] in {nan +nan -nan inf +inf -inf}} { - return [list FLOAT [tcl::string::tolower $f]] - } - if {[::tomlish::utils::is_float $f]} { - return [list FLOAT $f] - } else { - error "Unable to interpret '$f' as Toml float. Check your input, or check that tomlish is able to handle all Toml floats properly [::tomlish::parse::report_line]" - } - } - - proc DATETIME {str} { - if {[::tomlish::utils::is_datetime $str]} { - return [list DATETIME $str] - } else { - error "Unable to interpret '$str' as Toml datetime. 
Check your input, or check that tomlish is able to handle all Toml datetimes properly [::tomlish::parse::report_line]" - } - } - - proc BOOLEAN {b} { - #convert any Tcl-acceptable boolean to boolean as accepted by toml - lower case true/false - if {![tcl::string::is boolean -strict $b]} { - error "Unable to convert '$b' to Toml boolean true|false. [::tomlish::parse::report_line]" - } else { - if {$b && 1} { - return [::list BOOL true] - } else { - return [::list BOOL false] - } - } - } - - #REVIEW - #Take tablename followed by - # a) *tomlish* name-value pairs e.g table mydata [list KEY item11 = [list STRING "test"]] {KEY item2 = [list INT 1]} - # (accept also key value {STRING }) - # b) simple 2-element tcl lists being name & *simple* value pairs for which basic heuristics will be used to determine types - proc _table {name args} { - set pairs [list] - foreach t $args { - if {[llength $t] == 4} { - if {[tcl::string::tolower [lindex $t 0]] ne "key" || [tcl::string::tolower [lindex $t 2]] ni "= value"} { - error "Only items tagged as KEY = currently accepted as name-value pairs for table command" - } - lassign $t _k keystr _eq valuepart - if {[llength $valuepart] != 2} { - error "supplied value must be typed. e.g {INT 1} or {STRING test}" - } - lappend pairs [list KEY $keystr = $valuepart] - } elseif {[llength $t] == 2} { - #!todo - type heuristics - lassign $t n v - lappend pairs [list KEY $n = [list STRING $v]] - } else { - error "'KEY = { toml but - # the first newline is not part of the data. - # we elect instead to maintain a basic LITERALPART that must not contain newlines.. - # and to compose MULTILITERAL of multiple NEWLINE LITERALPART parts, - #with the datastructure representation dropping the first newline (if immediately following opening delim) when building the value. - set literal "" - foreach part [lrange $item 1 end] { - append literal [::tomlish::encode::tomlish [list $part] $nextcontext] - } - append toml '''$literal''' - } - INT - - BOOL - - FLOAT - - DATETIME { - append toml [lindex $item 1] - } - INCOMPLETE { - error "cannot process tomlish term tagged as INCOMPLETE" - } - COMMENT { - append toml "#[lindex $item 1]" - } - BOM { - #Byte Order Mark may appear at beginning of a file. Needs to be preserved. - append toml "\uFEFF" - } - default { - error "Not a properly formed 'tomlish' taggedlist.\n '$list'\n Unknown tag '[lindex $item 0]'. See output of \[tomlish::tags\] command." - } - } - - } - return $toml - } - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::encode ---}] -} -#fish toml from tomlish - -#(encode tomlish as toml) -interp alias {} tomlish::to_toml {} tomlish::encode::tomlish - -# - - -namespace eval tomlish::decode { - #*** !doctools - #[subsection {Namespace tomlish::decode}] - #[para] - #[list_begin definitions] - - #return a Tcl list of tomlish tokens - #i.e get a standard list of all the toml terms in string $s - #where each element of the list is a *tomlish* term.. i.e a specially 'tagged' Tcl list. - #(simliar to a tcl 'Huddle' - but also supporting whitespace preservation) - # ---------------------------------------------------------------------------------------------- - # NOTE: the production of tomlish from toml source doesn't indicate the toml source was valid!!! - # e.g we deliberately don't check certain things such as duplicate table declarations here. 
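#Sketch of the lossless round trip described above (from_toml parses raw toml text as
#used by toml_to_json earlier; to_toml is the encode alias defined above) - whitespace,
#comments, unprocessed escapes and mixed line endings should all survive:
set toml "  a =  1   # spacing and comment kept\n'b.c' = \"x\\ty\"\r\n"
puts [expr {[::tomlish::to_toml [::tomlish::from_toml $toml]] eq $toml}]   ;# expected: 1 (byte for byte)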
- # ---------------------------------------------------------------------------------------------- - #Part of the justification for this is that as long as the syntax is toml shaped - we can load files which violate certain rules and allow programmatic manipulation. - # (e.g perhaps a toml editor to highlight violations for fixing) - # A further stage is then necessary to load the tomlish tagged list into a data structure more suitable for efficient query/reading. - # e.g dicts or an object oriented structure - #Note also - *no* escapes in quoted strings are processed. This is up to the datastructure stage - #e.g to_dict will substitute \r \n \uHHHH \UHHHHHHH etc - #This is important for tomlish to maintain the ability to perform competely lossless round-trips from toml to tomlish and back to toml. - # (which is handy for testing as well as editing some part of the structure with absolutely no effect on other parts of the document) - #If we were to unescape a tab character for example - # - we have no way of knowing if it was originally specified as \t \u0009 or \U00000009 or directly as a tab character. - # For this reason, we also do absolutely no line-ending transformations based on platform. - # All line-endings are maintained as is, and even a file with mixed lf crlf line-endings will be correctly interpreted and can be 'roundtripped' - - proc toml {args} { - #*** !doctools - #[call [fun toml] [arg arg...]] - #[para] return a Tcl list of tomlish tokens - - set s [join $args \n] - - namespace upvar ::tomlish::parse is_parsing is_parsing - set is_parsing 1 - - if {[info command ::tomlish::parse::spacestack] eq "::tomlish::parse::spacestack"} { - tomlish::parse::spacestack destroy - } - struct::stack ::tomlish::parse::spacestack - - namespace upvar ::tomlish::parse last_space_action last_space_action - namespace upvar ::tomlish::parse last_space_type last_space_type - - namespace upvar ::tomlish::parse tok tok - set tok "" - - namespace upvar ::tomlish::parse type type - namespace upvar ::tomlish::parse tokenType tokenType - ::tomlish::parse::set_tokenType "" - namespace upvar ::tomlish::parse tokenType_list tokenType_list - set tokenType [list] ;#Flat (un-nested) list of tokentypes found - - namespace upvar ::tomlish::parse lastChar lastChar - set lastChar "" - - - set result "" - namespace upvar ::tomlish::parse nest nest - set nest 0 - - namespace upvar ::tomlish::parse v v ;#array keyed on nest level - - - set v(0) {TOMLISH} - array set s0 [list] ;#whitespace data to go in {SPACE {}} element. - set parentlevel 0 - - namespace upvar ::tomlish::parse i i - set i 0 - - namespace upvar ::tomlish::parse state state - - namespace upvar ::tomlish::parse braceCount braceCount - set barceCount 0 - namespace upvar ::tomlish::parse bracketCount bracketCount - set bracketCount 0 - - set sep 0 - set r 1 - namespace upvar ::tomlish::parse token_waiting token_waiting - set token_waiting [dict create] ;#if ::tok finds a *complete* second token during a run, it will put the 2nd one here to be returned by the next call. - - - set state "table-space" - ::tomlish::parse::spacestack push {type space state table-space} - namespace upvar ::tomlish::parse linenum linenum;#'line number' of input data. 
(incremented for each literal linefeed - but not escaped ones in data) - set linenum 1 - - set ::tomlish::parse::state_list [list] - try { - while {$r} { - set r [::tomlish::parse::tok $s] - #puts stdout "got tok: '$tok' while parsing string '$s' " - set next_tokenType_known 0 ;#whether we begin a new token here based on what terminated the token result of 'tok' - - - #puts "got token: '$tok' tokenType='$tokenType'. while v($nest) = [set v($nest)]" - #puts "-->tok: $tok tokenType='$tokenType'" - set prevstate $state - set transition_info [::tomlish::parse::goNextState $tokenType $tok $state] - #review goNextState could perform more than one space_action - set space_action [dict get $transition_info space_action] - set newstate [dict get $transition_info newstate] ;#use of 'newstate' vs 'state' makes code clearer below - - if {[tcl::string::match "err-*" $state]} { - ::tomlish::log::warn "---- State error in state $prevstate for tokenType: $tokenType token value: $tok. $state aborting parse. [tomlish::parse::report_line]" - lappend v(0) [list ERROR tokentype $tokenType state $prevstate to $state leveldata [set v($nest)]] - return $v(0) - } - # --------------------------------------------------------- - #NOTE there may already be a token_waiting at this point - #set_token_waiting can raise an error here, - # in which case the space_action branch needs to be rewritten to handle the existing token_waiting - # --------------------------------------------------------- - - if {$space_action eq "pop"} { - #pop_trigger_tokens: newline tablename endarray endinlinetable - #note a token is a pop trigger depending on context. e.g first newline during keyval is a pop trigger. - set parentlevel [expr {$nest -1}] - set do_append_to_parent 1 ;#most tokens will leave this alone - but some like tentative_accum_squote need to do their own append - switch -exact -- $tokenType { - tentative_accum_squote { - #should only apply within a multiliteral - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the xxx-squote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - #assert prevstate always trailing-squote-space - #dev guardrail - remove? assertion lib? - switch -exact -- $prevstate { - trailing-squote-space { - } - default { - error "--- unexpected popped due to tentative_accum_squote but came from state '$prevstate' should have been trailing-squote-space" - } - } - switch -- $tok { - ' { - tomlish::parse::set_token_waiting type single_squote value $tok complete 1 startindex [expr {$i -1}] - } - '' { - #review - we should perhaps return double_squote instead? 
- #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_squote value "" complete 1 startindex [expr {$i - 2}] - } - ''' { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 3}] - } - '''' { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left squote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]'" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "'"] - } - MULTILITERAL { - #empty - lappend v($parentlevel) [list LITERALPART "'"] - } - default { - error "--- don't know how to integrate extra trailing squote with data $v($parentlevel)" - } - } - } - ''''' { - tomlish::parse::set_token_waiting type triple_squote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 squotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - LITERALPART { - set newval "[lindex $lastpart 1]''" - set parentdata $v($parentlevel) - lset parentdata end [list LITERALPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE { - lappend v($parentlevel) [list LITERALPART "''"] - } - MULTILITERAL { - lappend v($parentlevel) [list LITERALPART "''"] - } - default { - error "--- don't know how to integrate extra trailing 2 squotes with data $v($parentlevel)" - } - } - } - } - } - triple_squote { - #presumably popping multiliteral-space - ::tomlish::log::debug "---- triple_squote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTILITERAL { - lappend merged $part - } - LITERALPART { - if {$lasttype eq "LITERALPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. - lappend merged $part - } - default { - error "---- triple_squote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - tentative_accum_dquote { - #should only apply within a multistring - #### - set do_append_to_parent 0 ;#mark false to indicate we will do our own appends if needed - #Without this - we would get extraneous empty list entries in the parent - # - as the trailing-dquote-space isn't a space level from the toml perspective - # - the use of a space is to give us a hook here to (possibly) integrate extra quotes into the parent space when we pop - #assert prevstate always trailing-dquote-space - #dev guardrail - remove? assertion lib? 
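#(mirrors the tentative_accum_squote handling above, but for basic strings:
# any extra trailing dquotes beyond the closing delimiter are folded back into
# the parent's last STRINGPART / appended to the MULTISTRING on pop)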
- switch -exact -- $prevstate { - trailing-dquote-space { - } - default { - error "--- unexpected popped due to tentative_accum_dquote but came from state '$prevstate' should have been trailing-dquote-space" - } - } - switch -- $tok { - {"} { - tomlish::parse::set_token_waiting type single_dquote value $tok complete 1 startindex [expr {$i -1}] - } - {""} { - #review - we should perhaps return double_dquote instead? - #tomlish::parse::set_token_waiting type literal value "" complete 1 - tomlish::parse::set_token_waiting type double_dquote value "" complete 1 startindex [expr {$i - 2}] - } - {"""} { - #### - #if already an eof in token_waiting - set_token_waiting will insert before it - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 3}] - } - {""""} { - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i - 4}] - #todo integrate left dquote with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - STRINGPART { - set newval "[lindex $lastpart 1]\"" - set parentdata $v($parentlevel) - lset parentdata end [list STRINGPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE - CONT - WS { - lappend v($parentlevel) [list STRINGPART {"}] - } - MULTISTRING { - #empty - lappend v($parentlevel) [list STRINGPART {"}] - } - default { - error "--- don't know how to integrate extra trailing dquote with data $v($parentlevel)" - } - } - } - {"""""} { - tomlish::parse::set_token_waiting type triple_dquote value $tok complete 1 startindex [expr {$i-5}] - #todo integrate left 2 dquotes with nest data at this level - set lastpart [lindex $v($parentlevel) end] - switch -- [lindex $lastpart 0] { - STRINGPART { - set newval "[lindex $lastpart 1]\"\"" - set parentdata $v($parentlevel) - lset parentdata end [list STRINGPART $newval] - set v($parentlevel) $parentdata - } - NEWLINE - CONT - WS { - lappend v($parentlevel) [list STRINGPART {""}] - } - MULTISTRING { - lappend v($parentlevel) [list STRINGPART {""}] - } - default { - error "--- don't know how to integrate extra trailing 2 dquotes with data $v($parentlevel)" - } - } - } - } - } - triple_dquote { - #presumably popping multistring-space - ::tomlish::log::debug "---- triple_dquote for last_space_action pop leveldata: $v($nest)" - set merged [list] - set lasttype "" - foreach part $v($nest) { - switch -exact -- [lindex $part 0] { - MULTISTRING { - lappend merged $part - } - STRINGPART { - if {$lasttype eq "STRINGPART"} { - set prevpart [lindex $merged end] - lset prevpart 1 [lindex $prevpart 1][lindex $part 1] - lset merged end $prevpart - } else { - lappend merged $part - } - } - CONT - WS { - lappend merged $part - } - NEWLINE { - #note that even though first newline ultimately gets stripped from multiliterals - that isn't done here - #we still need the first one for roundtripping. The datastructure stage is where it gets stripped. - lappend merged $part - } - default { - error "---- triple_dquote unhandled part type [lindex $part 0] unable to merge leveldata: $v($nest)" - } - } - set lasttype [lindex $part 0] - } - set v($nest) $merged - } - equal { - #pop caused by = - switch -exact -- $prevstate { - dottedkey-space { - tomlish::log::debug "---- equal ending dottedkey-space for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - dottedkey-space-tail { - #experiment? 
- tomlish::log::debug "---- equal ending dottedkey-space-tail for last_space_action pop" - #re-emit for parent space - tomlish::parse::set_token_waiting type equal value = complete 1 startindex [expr {$i-1}] - } - } - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - tablename { - #note: a tablename only 'pops' if we are greater than zero - error "---- tablename pop should already have been handled as special case zeropoppushspace in goNextState" - } - tablearrayname { - #!review - tablearrayname different to tablename regarding push/pop? - #note: a tablename only 'pops' if we are greater than zero - error "---- tablearrayname pop should already have been handled as special case zeropoppushspace in goNextState" - } - endarray { - #nothing to do here. - } - comma { - #comma for inline table will pop the keyvalue space - lappend v($nest) "SEP" - } - endinlinetable { - ::tomlish::log::debug "---- endinlinetable for last_space_action pop" - } - default { - error "---- unexpected tokenType '$tokenType' for last_space_action 'pop'" - } - } - if {$do_append_to_parent} { - #e.g tentative_accum_squote does it's own appends as necessary - so won't get here - lappend v($parentlevel) [set v($nest)] - } - - incr nest -1 - - } elseif {$last_space_action eq "push"} { - set prevnest $nest - incr nest 1 - set v($nest) [list] - # push_trigger_tokens: barekey dquotedkey startinlinetable startarray tablename tablearrayname - - - switch -exact -- $tokenType { - tentative_trigger_squote - tentative_trigger_dquote { - #### this startok will always be tentative_accum_squote/tentative_accum_dquote starting with one accumulated squote/dquote - if {[dict exists $transition_info starttok] && [dict get $transition_info starttok] ne ""} { - lassign [dict get $transition_info starttok] starttok_type starttok_val - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType $starttok_type - set tok $starttok_val - } - } - single_squote { - #JMN - REVIEW - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - triple_squote { - ::tomlish::log::debug "---- push trigger tokenType triple_squote" - set v($nest) [list MULTILITERAL] ;#container for NEWLINE,LITERALPART - } - squotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - triple_dquote { - set v($nest) [list MULTISTRING] ;#container for NEWLINE,STRINGPART,CONT - } - dquotedkey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - barekey { - switch -exact -- $prevstate { - table-space - itable-space { - set v($nest) [list DOTTEDKEY] - } - } - #todo - check not something already waiting? 
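#(unlike the squotedkey/dquotedkey arms above, barekey first checks for an
# already-waiting token so its startindex can be reused before re-submitting
# the key into the newly pushed space)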
- set waiting [tomlish::parse::get_token_waiting] - if {[llength $waiting]} { - set i [dict get $waiting startindex] - tomlish::parse::clear_token_waiting - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } else { - tomlish::parse::set_token_waiting type $tokenType value $tok complete 1 startindex [expr {$i -[tcl::string::length $tok]}] ;#re-submit token in the newly pushed space - } - } - tablename { - #note: we do not use the output of tomlish::to_dict::tablename_trim to produce a tablename for storage in the tomlish list! - #The tomlish list is intended to preserve all whitespace (and comments) - so a roundtrip from toml file to tomlish - # back to toml file will be identical. - #It is up to the datastructure stage to normalize and interpret tomlish for programmatic access. - # we call tablename_trim here only to to validate that the tablename data is well-formed at the outermost level, - # so we can raise an error at this point rather than create a tomlish list with obviously invalid table names from - # a structural perspective. - - #todo - review! It's arguable that we should not do any validation here, and just store even incorrect raw tablenames, - # so that the tomlish list is more useful for say a toml editor. Consider adding an 'err' tag to the appropriate place in the - # tomlish list? - - #set trimtable [::tomlish::to_dict::tablename_trim $tok] - #::tomlish::log::debug "---- trimmed (but not normalized) tablename: '$trimtable'" - set v($nest) [list TABLE $tok] ;#$tok is the *raw* table name - #note also that equivalent tablenames may have different toml representations even after being trimmed! - #e.g ["x\t\t"] & ["x "] (tab escapes vs literals) - #These will show as above in the tomlish list, but should normalize to the same tablename when used as keys by the datastructure stage. - } - tablearrayname { - #set trimtable [::tomlish::to_dict::tablename_trim $tok] - #::tomlish::log::debug "---- trimmed (but not normalized) tablearrayname: '$trimtable'" - set v($nest) [list TABLEARRAY $tok] ;#$tok is the *raw* tablearray name - } - startarray { - set v($nest) [list ARRAY] ;#$tok is just the opening bracket - don't output. - } - startinlinetable { - set v($nest) [list ITABLE] ;#$tok is just the opening curly brace - don't output. - } - default { - error "---- push trigger tokenType '$tokenType' not yet implemented" - } - } - - } else { - #no space level change - switch -exact -- $tokenType { - squotedkey { - #puts "---- squotedkey in state $prevstate (no space level change)" - lappend v($nest) [list SQKEY $tok] - } - dquotedkey { - #puts "---- dquotedkey in state $prevstate (no space level change)" - lappend v($nest) [list DQKEY $tok] - } - barekey { - lappend v($nest) [list KEY $tok] - } - dotsep { - lappend v($nest) [list DOTSEP] - } - starttablename { - #$tok is triggered by the opening bracket and sends nothing to output - } - starttablearrayname { - #$tok is triggered by the double opening brackets and sends nothing to output - } - tablename - tablenamearray { - error "---- did not expect 'tablename/tablearrayname' without space level change (no space level change)" - #set v($nest) [list TABLE $tok] - } - endtablename - endtablearrayname { - #no output into the tomlish list for this token - } - startinlinetable { - puts stderr "---- decode::toml error. 
did not expect startinlinetable without space level change (no space level change)" - } - single_dquote { - switch -exact -- $newstate { - string-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "string" - set tok "" - } - dquoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "dquotedkey" - set tok "" - } - multistring-space { - lappend v($nest) [list STRINGPART {"}] - #may need to be joined on pop if there are neighbouring STRINGPARTS - } - default { - error "---- single_dquote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - double_dquote { - #leading extra quotes - test: toml_multistring_startquote2 - switch -exact -- $prevstate { - itable-keyval-value-expected - keyval-value-expected { - puts stderr "tomlish::decode::toml double_dquote TEST" - #empty string - lappend v($nest) [list STRINGPART ""] - } - multistring-space { - #multistring-space to multistring-space - lappend v($nest) [list STRINGPART {""}] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - - } - single_squote { - switch -exact -- $newstate { - literal-state { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "literal" - set tok "" - } - squoted-key { - set next_tokenType_known 1 - ::tomlish::parse::set_tokenType "squotedkey" - set tok "" - } - multiliteral-space { - #false alarm squote returned from tentative_accum_squote pop - ::tomlish::log::debug "---- adding lone squote to own LITERALPART nextstate: $newstate (no space level change)" - #(single squote - not terminating space) - lappend v($nest) [list LITERALPART '] - #may need to be joined on pop if there are neighbouring LITERALPARTs - } - default { - error "---- single_squote switch case not implemented for nextstate: $newstate (no space level change)" - } - } - } - double_squote { - switch -exact -- $prevstate { - keyval-value-expected { - lappend v($nest) [list LITERAL ""] - } - multiliteral-space { - #multiliteral-space to multiliteral-space - lappend v($nest) [list LITERALPART ''] - } - default { - error "--- unhandled tokenType '$tokenType' when transitioning from state $prevstate to $newstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - enddquote { - #nothing to do? - set tok "" - } - endsquote { - set tok "" - } - string { - lappend v($nest) [list STRING $tok] ;#directly wrapped in dquotes - } - literal { - lappend v($nest) [list LITERAL $tok] ;#directly wrapped in squotes - } - multistring { - #review - lappend v($nest) [list MULTISTRING $tok] - } - stringpart { - lappend v($nest) [list STRINGPART $tok] ;#will not get wrapped in dquotes directly - } - multiliteral { - lappend v($nest) [LIST MULTILITERAL $tok] - } - literalpart { - lappend v($nest) [list LITERALPART $tok] ;#will not get wrapped in squotes directly - } - untyped_value { - #would be better termed unclassified_value - #we can't determine the type of unquoted values (int,float,datetime,bool) until the entire token was read. - unset -nocomplain tag - if {$tok in {true false}} { - set tag BOOL - } else { - if {[::tomlish::utils::is_int $tok]} { - set tag INT - } else { - if {[string is integer -strict $tok]} { - #didn't qualify as a toml int - but still an int - #probably means is_int is limiting size and not accepting bigints (configurable?) 
- #or it didn't qualify due to more than 1 leading zero - #or other integer format issue such as repeated underscores - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. (looks close to being an int. Formatting or range issue?) [tomlish::parse::report_line] (no space level change)" - } else { - if {[::tomlish::utils::is_float $tok]} { - set tag FLOAT - } elseif {[::tomlish::utils::is_datetime $tok] || [::tomlish::utils::is_timepart $tok]} { - #Note we must allow lone timepart here (not just is_localtime which doesn't allow tz offsets) in case it followed a localdate - #e.g x= 2025-01-01 02:34Z - #The to_dict validation will catch an invalid standaline timepart, or combine with leading date if applicable. - set tag DATETIME - } else { - error "---- Unable to interpret '$tok' as Boolean, Integer, Float or Datetime as per the toml specs. [tomlish::parse::report_line] (no space level change)" - } - } - } - } - #assert either tag is set, or we errored out. - lappend v($nest) [list $tag $tok] - - } - comment { - #puts stdout "----- comment token returned '$tok'------" - lappend v($nest) [list COMMENT "$tok"] - } - equal { - #we append '=' to the nest so that any surrounding whitespace is retained. - lappend v($nest) = - } - comma { - lappend v($nest) SEP - } - newline { - incr linenum - lappend v($nest) [list NEWLINE $tok] - } - whitespace { - lappend v($nest) [list WS $tok] - } - continuation { - lappend v($nest) CONT - } - bom { - lappend v($nest) BOM - } - eof { - #ok - nothing more to add to the tomlish list. - #!todo - check previous tokens are complete/valid? - } - default { - error "--- unknown tokenType '$tokenType' during state $prevstate [::tomlish::parse::report_line] (no space level change)" - } - } - } - - if {!$next_tokenType_known} { - ::tomlish::log::notice "---- tomlish::decode::toml - current tokenType:$tokenType Next token type not known" - ::tomlish::parse::set_tokenType "" - set tok "" - } - - if {$state eq "end-state"} { - break - } - - - } - - #while {$nest > 0} { - # lappend v([expr {$nest -1}]) [set v($nest)] - # incr nest -1 - #} - while {[::tomlish::parse::spacestack size] > 1} { - ::tomlish::parse::spacestack pop - lappend v([expr {$nest -1}]) [set v($nest)] - incr nest -1 - - #set parent [spacestack peek] ;#the level being appended to - #lassign $parent type state - #if {$type eq "space"} { - # - #} elseif {$type eq "buffer"} { - # lappend v([expr {$nest -1}]) {*}[set v($nest)] - #} else { - # error "invalid spacestack item: $parent" - #} - } - - } finally { - set is_parsing 0 - } - return $v(0) - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::decode ---}] -} -#decode toml to tomlish -interp alias {} tomlish::from_toml {} tomlish::decode::toml - -namespace eval tomlish::utils { - #*** !doctools - #[subsection {Namespace tomlish::utils}] - #[para] - #[list_begin definitions] - - - - #basic generic quote matching for single and double quotes - #note for example that {[o'malley]} will return sq - as the single quote is not closed or wrapped in double quotes - proc tok_in_quotedpart {tok} { - set sLen [tcl::string::length $tok] - set quote_type "" - set had_slash 0 - for {set i 0} {$i < $sLen} {incr i} { - set c [tcl::string::index $tok $i] - if {$quote_type eq ""} { - if {$had_slash} { - #don't enter quote mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch 
-- $ctype { - dq { - set quote_type dq - } - sq { - set quote_type sq - } - bsl { - set had_slash 1 - } - } - } - } else { - if {$had_slash} { - #don't leave quoted mode - #leave slash_mode because even if current char is slash - it is escaped - set had_slash 0 - } else { - set ctype [tcl::string::map [list {"} dq {'} sq \\ bsl] $c] - switch -- $ctype { - dq { - if {$quote_type eq "dq"} { - set quote_type "" - } - } - sq { - if {$quote_type eq "sq"} { - set quote_type "" - } - } - bsl { - set had_slash 1 - } - } - } - } - } - return $quote_type ;#dq | sq - } - - - proc unicode_escape_info {slashu} { - #!todo - # validate that slashu is either a \uxxxx or \Uxxxxxxxx value of the correct length and - # is a valid 'unicode scalar value' (any Unicode code point except high-surrogate and low-surrogate code points) - # ie integers in the range 0 to D7FF16 and E00016 to 10FFFF16 inclusive - #expr {(($x >= 0) && ($x <= 0xD7FF16)) || (($x >= 0xE00016) && ($x <= 0x10FFFF16))} - if {[tcl::string::match {\\u*} $slashu]} { - set exp {^\\u([0-9a-fA-F]{4}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %4x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } - } else { - return [list err [list reason "Supplied string not of the form \\uHHHH where H in \[0-9a-fA-F\]"]] - } - } elseif {[tcl::string::match {\\U*} $slashu]} { - set exp {^\\U([0-9a-fA-F]{8}$)} - if {[regexp $exp $slashu match hex]} { - if {[scan $hex %8x dec] != 1} { - #why would a scan ever fail after matching the regexp? !todo - review. unreachable branch? - return [list err [list reason "Failed to convert '$hex' to decimal"]] - } else { - if {(($dec >= 0) && ($dec <= 0xD7FF16)) || (($dec >= 0xE00016) && ($dec <= 0x10FFFF16))} { - return [list ok [list char [subst -nocommand -novariable $slashu]]] - } else { - return [list err [list reason "$slashu is not within the 'unicode scalar value' ranges 0 to 0xD7FF16 or 0xE00016 to 0x10FFFF16"]] - } - } - } else { - return [list err [list reason "Supplied string not of the form \\UHHHHHHHH where H in \[0-9a-fA-F\]"]] - } - } else { - return [list err [list reason "Supplied string did not start with \\u or \\U" ]] - } - - } - - #Note that unicode characters don't *have* to be escaped. - #So if we provide a function named 'escape_string', the name implies the inverse of unescape_string which unescapes unicode \u \U values. - #- an inverse of unescape_string would encode all unicode chars unnecessarily. - #- as toml accepts a compact escape sequence for common chars such as tab,backspace,linefeed etc but also allows the full form \u009 etc - #- escape_string and unescape_string would not be reliably roundtrippable inverses anyway. - #REVIEW - provide it anyway? When would it be desirable to use? - - variable Bstring_control_map [dict create] - dict set Bstring_control_map \b {\b} - dict set Bstring_control_map \n {\n} - dict set Bstring_control_map \r {\r} - dict set Bstring_control_map \" {\"} - dict set Bstring_control_map \x1b {\e} ;#In spec it's included in the list of 'must be escaped', as well as the 'convenience' escapes - so we make it go both ways. - dict set Bstring_control_map \\ "\\\\" - - #\e for \x1b seems like it might be included - v1.1?? 
hard to find current state of where toml is going :/ - #for a Bstring (Basic string) tab is explicitly mentioned as not being one that must be escaped. - #8 = \b - already in list. - #built the remainder whilst checking for entries already hardcoded above -in case more are added to the hardcoded list - for {set cdec 0} {$cdec <= 7} {incr cdec} { - set hhhh [format %.4X $cdec] - set char [format %c $cdec] - if {![dict exists $Bstring_control_map $char]} { - dict set Bstring_control_map $char \\u$hhhh - } - } - for {set cdec [expr {0x0A}]} {$cdec <= 0x1F} {incr cdec} { - set hhhh [format %.4X $cdec] - set char [format %c $cdec] - if {![dict exists $Bstring_control_map $char]} { - dict set Bstring_control_map $char \\u$hhhh - } - } - # \u007F = 127 - dict set Bstring_control_map [format %c 127] \\u007F - - #Note the inclusion of backslash in the list of controls makes this non idempotent - subsequent runs would keep encoding the backslashes! - #escape only those chars that must be escaped in a Bstring (e.g not tab which can be literal or escaped) - #for example - can be used by from_dict to produce valid Bstring data for a tomlish record - proc rawstring_to_Bstring_with_escaped_controls {str} { - #for the well known chars that have compact escape sequences allowed by toml - we choose that form over the full \u form. - #we'll use a string map with an explicit list rather than algorithmic at runtime - # - the string map is probably more performant than splitting a string, especially if it's large - variable Bstring_control_map - return [string map $Bstring_control_map $str] - } - - #review - unescape what string? Bstring vs MLBstring? - #we should be specific in the function naming here - #used by to_dict - so part of validation? - REVIEW - proc unescape_string {str} { - #note we can't just use Tcl subst because: - # it also transforms \a (audible bell) and \v (vertical tab) which are not in the toml spec. - # it would strip out backslashes inappropriately: e.g "\j" becomes just j - # it recognizes other escapes which aren't approprite e.g \xhh and octal \nnn - # it replaces \ with a single whitespace (trailing backslash) - #This means we shouldn't use 'subst' on the whole string, but instead substitute only the toml-specified escapes (\r \n \b \t \f \\ \" \uhhhh & \Uhhhhhhhh - #plus \e for \x1b? - - set buffer "" - set buffer4 "" ;#buffer for 4 hex characters following a \u - set buffer8 "" ;#buffer for 8 hex characters following a \u - - set sLen [tcl::string::length $str] - - #we need to handle arbitrarily long sequences of backslashes. \\\\\ etc - set slash_active 0 - set unicode4_active 0 - set unicode8_active 0 - - ::tomlish::log::debug "unescape_string. got len [string length str] str $str" - - #!todo - check for invalid data in the form of a raw carriage return (decimal 13) without following linefeed? - set i 0 - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $str [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $str $i] - #::tomlish::log::debug "unescape_string. got char $c" ;#too much? - - #---------------------- - #as we are 'unescaping' - should we really be testing for existing values that should have been escaped? - #this test looks incomplete anyway REVIEW - scan $c %c n - if {($n <= 31) && ($n != 9) && ($n != 10) && ($n != 13)} { - #we don't expect unescaped unicode characters from 0000 to 001F - - #*except* for raw tab (which is whitespace) and newlines - error "unescape_string. Invalid data for a toml string. 
Unescaped control character (decimal $n) [::tomlish::utils::string_to_slashu $c]" - } - #---------------------- - - incr i ;#must incr here because we do'returns'inside the loop - if {$c eq "\\"} { - if {$slash_active} { - append buffer "\\" - set slash_active 0 - } elseif {$unicode4_active} { - error "unescape_string. unexpected case slash during unicode4 not yet handled" - } elseif {$unicode8_active} { - error "unescape_string. unexpected case slash during unicode8 not yet handled" - } else { - # don't output anything (yet) - set slash_active 1 - } - } else { - if {$unicode4_active} { - if {[tcl::string::length $buffer4] < 4} { - append buffer4 $c - } - if {[tcl::string::length $buffer4] == 4} { - #we have a \uHHHH to test - set unicode4_active 0 - set result [tomlish::utils::unicode_escape_info "\\u$buffer4"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$unicode8_active} { - if {[tcl::string::length $buffer8] < 8} { - append buffer8 $c - } - if {[tcl::string::length $buffer8] == 8} { - #we have a \UHHHHHHHH to test - set unicode8_active 0 - set result [tomlish::utils::unicode_escape_info "\\U$buffer8"] - if {[lindex $result 0] eq "ok"} { - append buffer [dict get $result ok char] - } else { - error "unescape_string error: [lindex $result 1]" - } - } - } elseif {$slash_active} { - set slash_active 0 - set ctest [tcl::string::map {{"} dq} $c] - switch -exact -- $ctest { - dq { - append buffer {"} - } - b - t - n - f - r { - append buffer [subst -nocommand -novariable "\\$c"] - } - e { - append buffer \x1b - } - u { - set unicode4_active 1 - set buffer4 "" - } - U { - set unicode8_active 1 - set buffer8 "" - } - default { - set slash_active 0 - #review - toml spec says all other escapes are reserved - #and if they are used TOML should produce an error. - #we leave detecting this for caller for now - REVIEW - append buffer "\\$c" - } - } - } else { - append buffer $c - } - } - } - #puts stdout "EOF 4:$unicode4_active 8:$unicode8_active slash:$slash_active" - if {$unicode4_active} { - error "End of string reached before complete unicode escape sequence \uHHHH" - } - if {$unicode8_active} { - error "End of string reached before complete unicode escape sequence \UHHHHHHHH" - } - if {$slash_active} { - append buffer "\\" - } - return $buffer - } - - #This does not have to do with unicode normal forms - which it seems toml has decided against regarding use in keys (review/references?) - #This is meant for internal use regarding ensuring we match equivalent keys which may have just been specified with different string mechanisms, - #e.g squoted vs dquoted vs barekey. - proc normalize_key {rawkey} { - set c1 [tcl::string::index $rawkey 0] - set c2 [tcl::string::index $rawkey end] - if {($c1 eq "'") && ($c2 eq "'")} { - #single quoted segment. No escapes allowed within it. - set key [tcl::string::range $rawkey 1 end-1] - } elseif {($c1 eq "\"") && ($c2 eq "\"")} { - #double quoted segment. Apply escapes. - # - set keydata [tcl::string::range $rawkey 1 end-1] ;#strip outer quotes only - #e.g key could have mix of \UXXXXXXXX escapes and unicode chars - #or mix of \t and literal tabs. - #unescape to convert all to literal versions for comparison - set key [::tomlish::utils::unescape_string $keydata] - #set key [subst -nocommands -novariables $keydata] ;#wrong. Todo - create a string escape substitution function. 
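#e.g a dquoted segment written as "a\tb" and one written with a literal tab
#character normalize to the same key once the escape is converted to the literal char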
- } else { - set key $rawkey - } - return $key - } - - proc string_to_slashu {string} { - set rv {} - foreach c [split $string {}] { - scan $c %c cdec - if {$cdec > 65535} { - append rv {\U} [format %.8X $cdec] - } else { - append rv {\u} [format %.4X $cdec] - } - } - return $rv - } - - #'nonprintable' is conservative here because some systems (e.g windows console) are very limited in what they can display. - #This is used for display purposes only (error msgs) - proc nonprintable_to_slashu {s} { - set res "" - foreach i [split $s ""] { - scan $i %c cdec - - set printable 0 - if {($cdec>31) && ($cdec<127)} { - set printable 1 - } - if {$printable} { - append res $i - } else { - if {$cdec > 65535} { - append res \\U[format %.8X $cdec] - } else { - append res \\u[format %.4X $cdec] - } - } - } - set res - } ;# initial version from tcl wiki RS - - #check if str is valid for use as a toml bare key - #Early toml versions? only allowed letters + underscore + dash - proc is_barekey1 {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } else { - set matches [regexp -all {[a-zA-Z0-9\_\-]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters match the regexp - return 1 - } else { - return 0 - } - } - } - - #from toml.abnf in github.com/toml-lang/toml - #unquoted-key = 1*unquoted-key-char - #unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ - #unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions - #unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block - #unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK, which is basically a semi-colon - #unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - #unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics - #unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic up/down markers and spaces - #unquoted-key-char =/ %x2070-21FF / %x2300-24FF ; skip math operators - #unquoted-key-char =/ %x25A0-268B / %x2690-2757 ; skip box drawing, block elements, and some yin-yang symbols - #unquoted-key-char =/ %x2762-2767 / %x2776-27E5 ; skip some Dingbat punctuation - #unquoted-key-char =/ %x2801-297F ; skip some math brackets and arrows, and braille blank - #unquoted-key-char =/ %x2B00-2FFF / %x3001-D7FF ; skip various math operators and symbols, and ideographic space - #unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - #unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - variable re_barekey - set ranges [list] - lappend ranges {a-zA-Z0-9\_\-} - lappend ranges {\u00B2} {\u00B3} {\u00B9} {\u00BC-\u00BE} ;# superscript digits, fractions - lappend ranges {\u00C0-\u00D6} {\u00D8-\u00F6} {\u00F8-\u037D} ;# non-symbol chars in Latin block - lappend ranges {\u037f-\u1FFF} ;# exclude GREEK QUESTION MARK, which is basically a semi-colon - lappend ranges {\u200C-\u200D} {\u203F-\u2040} ;# from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ - lappend ranges {\u2070-\u218f} {\u2460-\u24FF} ;# include super-subscripts, letterlike/numberlike forms, enclosed alphanumerics - lappend ranges {\u2C00-\u2FEF} {\u3001-\uD7FF} ;# skip arrows, math, box drawing etc, skip 2FF0-3000 ideographic 
up/down markers and spaces - lappend ranges {\u2070-\u21FF} {\u2300-\u24FF} ;# skip math operators - lappend ranges {\u25A0-\u268B} {\u2690-\u2757} ;# skip box drawing, block elements, and some yin-yang symbols - lappend ranges {\u2762-\u2767} {\u2776-\u27E5} ;# skip some Dingbat punctuation - lappend ranges {\u2801-\u297F} ;# skip some math brackets and arrows, and braille blank - lappend ranges {\u2B00-\u2FFF} {\u3001-\uD7FF} ;# skip various math operators and symbols, and ideographic space - lappend ranges {\uF900-\uFDCF} {\uFDF0-\uFFFD} ;# skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (unicode) - lappend ranges {\U10000-\UEFFFF} ;# all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) - set re_barekey {^[} - foreach r $ranges { - append re_barekey $r - } - append re_barekey {]+$} - - proc is_barekey {str} { - if {[tcl::string::length $str] == 0} { - return 0 - } - variable re_barekey - return [regexp $re_barekey $str] - } - - #test only that the characters in str are valid for the toml specified type 'integer'. - proc int_validchars1 {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_\-\+]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - #add support for hex,octal,binary 0x.. 0o.. 0b... - proc int_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_int {str} { - set matches [regexp -all {[0-9\_xo\-\+A-Fa-f]} $str] ;#0b101 etc covered by a-f - - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - - # --------------------------------------- - #check for leading zeroes in non 0x 0b 0o - #first strip any +, - or _ (just for this test) - #(but still allowing 0 -0 +0) - set check [tcl::string::map {+ "" - "" _ ""} $str] - if {([tcl::string::length $check] > 1) && ([tcl::string::index $check 0] eq "0") && ([tcl::string::index $check 1] ni {o x b})} { - return 0 - } - # --------------------------------------- - - #check +,- only occur in the first position. (excludes also +++1 etc) - if {[tcl::string::last - $str] > 0} { - return 0 - } - if {[tcl::string::last + $str] > 0} { - return 0 - } - - #------------------------------------------- - #unclear if a 'digit' includes the type specifiers x b o - #we assume the 0x 0b 0o are NOT counted as digits - as underscores here would seem - #to be likely to cause interop issues with other systems - #(e.g tcl allows 0b1_1 but not 0b_11) - #Most of this structure would be unnecessary if we could rely on string::is::integer understanding underscores (9+?) - #we still need to support earlier Tcl for now though. 
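#for reference - examples for the combined checks below (assuming default max_int/min_int):
#   is_int 1_000   -> 1      is_int 0xFF_FF -> 1      is_int +42 -> 1      is_int 0o755 -> 1
#   is_int 01      -> 0 (leading zero)
#   is_int 1__0    -> 0 (doubled underscore)          is_int 1_  -> 0 (trailing underscore)
#   is_int 3e4     -> 0 (float-looking - would need a 0x prefix to be hex)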
- - #first rule out any case with more than one underscore in a row - if {[regexp {__} $str]} { - return 0 - } - if {[string index $str 0] eq "_"} { - return 0 - } - set utest [string trimleft $str +-] - #test again for further trick like _+_0xFF - if {[string index $utest 0] eq "_"} { - return 0 - } - if {[string range $utest 0 1] in {0x 0b 0o}} { - set testnum [string range $utest 2 end] - } else { - set testnum $utest - #exclude also things like 0_x 0___b that snuck past our prefix test - if {![string is digit -strict [string map {_ ""} $testnum]]} { - return 0 - } - #assert - only digits and underscores in testnum - #still may have underscores at each end - } - #assert testnum is now the 'digits' portion of a , 0x 0b 0o number - #(+ and - already stripped) - #It may still have chars unsuitable for its type - which will be caught by the string::is::integer test below - if {[string length $testnum] != [string length [string trim $testnum _]]} { - #had non-inner underscores in 'digit' part - return 0 - } - #assert str only has solo inner underscores (if any) between 'digits' - #------------------------------------------- - - set numeric_value [tcl::string::map {_ ""} $str] ;#allow some earlier tcl versions which don't support underscores - #use Tcl's integer check to ensure we don't let things like 3e4 through - which is a float (would need to be 0x3e4 for hex) - if {![tcl::string::is integer -strict $numeric_value]} { - return 0 - } - - - - #!todo - check bounds only based on some config value - #even though Tcl can handle bignums, we won't accept anything outside of toml 1.0 minimum requirements by default (for now) - #presumably very large numbers would have to be supplied in a toml file as strings. - #Review - toml 1.0 only says that it must handle up to 2^63 - not that this is a max - #some question around implementations allowed to use lower values such as 2^31 on some systems? - if {$::tomlish::max_int ne "" && $numeric_value > $::tomlish::max_int} { - return 0 - } - if {$::tomlish::min_int ne "" && $numeric_value < $::tomlish::min_int} { - return 0 - } - } else { - return 0 - } - #Got this far - didn't find anything wrong with it. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'float'. - proc float_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[eE0-9\_\-\+\.]} $str] == $numchars} { - return 1 - } else { - #only allow lower case for these special values - as per Toml 1.0 spec - if {$str ni {inf +inf -inf nan +nan -nan}} { - return 0 - } else { - return 1 - } - } - } - - #note - Tcl's string is double will return true also for the subset of float values which are integers - #This function is to determine whether it matches the Toml float concept - so requires a . or e or E - proc is_float {str} { - #vip greenlight known literals, don't test for case variations - as Toml doesn't allow (whereas Tcl allows Inf NaN etc) - if {$str in {inf +inf -inf nan +nan -nan}} { - return 1 - } - #doorcheck the basics for floatiness vs members of that rival gang - ints - if {![regexp {[.eE]} $str]} { - #could be an integer - which isn't specifically a float for Toml purposes. 
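#e.g a plain 7 or 1_000 returns 0 here - those are classified by is_int instead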
- return 0 - } - - - #patdown for any contraband chars - set matches [regexp -all {[eE0-9\_\-\+\.]} $str] - if {[tcl::string::length $str] != $matches} { - return 0 - } - - #all characters in legal range - - #A leading zero is ok, but we should disallow multiple leading zeroes (same rules as toml ints) - - #Early Toml spec also disallowed leading zeros in the exponent part(?) - #... this seems less interoperable anyway (some libraries generate leading zeroes in exponents) - #we allow leading zeros in exponents here. - - #Check for leading zeros in main part - #first strip any +, - or _ (just for this test) - set check [tcl::string::map {+ "" - "" _ ""} $str] - set r {([0-9])*} - regexp $r $check intpart ;#intpart holds all numerals before the first .,e or E - set z {([0])*} - regexp $z $intpart leadingzeros - if {[tcl::string::length $leadingzeros] > 1} { - return 0 - } - - #for floats, +,- may occur in multiple places - #e.g -2E-22 +3e34 - #!todo - check bounds ? - - #----------------------------------------- - if {[regexp {__} $str]} { - return 0 - } - if {[string index $str 0] eq "_" || [string index $str end] eq "_"} { - return 0 - } - set utest [string trimleft $str +-] - #test again for further trick like _+_ - if {[string index $utest 0] eq "_"} { - return 0 - } - #----------------------------------------- - - #decimal point, if used must be surrounded by at least one digit on each side - #e.g 3.e+20 also illegal - set dposn [string first . $str] - if {$dposn > -1 } { - set d3 [string range $str $dposn-1 $dposn+1] - if {![string is integer -strict [string index $d3 0]] || ![string is integer -strict [string index $d3 2]]} { - return 0 - } - } - #we've already eliminated leading/trailing underscores - #now ensure each inner underscore is surrounded by digits - if {[regexp {_[^0-9]|[^0-9]_} $str]} { - return 0 - } - - #strip underscores for tcl double check so we can support < tcl 9 versions which didn't allow underscores - set check [tcl::string::map {_ ""} $str] - #string is double accepts inf nan +NaN etc. - if {![tcl::string::is double $check]} { - return 0 - } - - #All good - seems to be a toml-approved float and not an int. - return 1 - } - - #test only that the characters in str are valid for the toml specified type 'datetime'. 
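#(character-class check only - structural validation is done by is_datepart/is_timepart/is_datetime below)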
- proc datetime_validchars {str} { - set numchars [tcl::string::length $str] - if {[regexp -all {[zZtT0-9\-\+\.:]} $str] == $numchars} { - return 1 - } else { - return 0 - } - } - - proc is_datepart {str} { - set matches [regexp -all {[0-9\-]} $str] - if {[tcl::string::length $str] != $matches} { - return 0 - } - #seems to require yyyy-mm-dd (e.g not allowing just yyyy-mm) - if {![regexp {^([0-9]{4})-([0-9]{2})-([0-9]{2})$} $str _match y m d]} { - return 0 - } - if {$m > 12 || $m == 0} { - return 0 - } - switch -- [expr {$m}] { - 1 - 3 - 5 - 7 - 8 - 10 - 12 { - if {$d > 31 || $d == 0} { - return 0 - } - } - 2 { - if {$d > 29 || $d == 0} { - return 0 - } - if {$d == 29} { - #leapyear check - if {[catch {clock scan $str -format %Y-%m-%d} errM]} { - return 0 - } - } - } - 4 - 6 - 9 - 11 { - if {$d > 30 || $d == 0} { - return 0 - } - } - } - return 1 - } - proc is_localdate {str} { - is_datepart $str - } - proc is_timepart {str} { - set numchars [tcl::string::length $str] - #timepart can have negative or positive offsets so - and + must be accepted - if {[regexp -all {[zZt0-9\-\+\.:]} $str] == $numchars} { - #todo - #basic check that we have leading 2dig hr and 2dig min separated by colon - if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}[^0-9]{1}.*$} $str]} { - #nn:nn or nn:nnX.* where X is non digit - return 0 - } - return 1 - } else { - return 0 - } - } - proc is_localtime {str} { - #time of day without any relation to a specific day or any offset or timezone - set numchars [tcl::string::length $str] - if {[regexp -all {[0-9\.:]} $str] == $numchars} { - #todo - if {![regexp {^[0-9]{2}:[0-9]{2}$|^[0-9]{2}:[0-9]{2}:[0-9]{2}([.][0-9]*){0,1}$} $str]} { - #hh:mm or hh:mm:ss or hh:mm::ss.nnn - return 0 - } - return 1 - } else { - return 0 - } - } - - #review - proc is_datetime {str} { - #Essentially RFC3339 formatted date-time - but: - #1) allowing seconds to be omitted (:00 assumed) - #2) T may be replaced with a single space character TODO - parser support for space in datetime! - # (RFC 3339 allows space instead of T also - but doesn't specify it *must* be a single space) - - #toml-lint @2025-04 doesn't accept t for T or z for Z - but RFC3339 does - #toml spec doesn't clarify - we will accept - - #e.g 1979-05-27 - #e.g 1979-05-27T00:32:00Z - #e.g 1979-05-27 00:32:00-07:00 - #e.g 1979-05-27 00:32:00+10:00 - #e.g 1979-05-27 00:32:00.999999-07:00 - - #review - #minimal datetimes? - # 2024 not ok - 2024T not accepted by tomlint why? - # 02:00 ok - # 02:00:00.5 ok - # 1:00 - not ok - RFC3339 requires 2-digit hr,min,sec - - #toml-lint.com accepts 2025-01 - - if {[string length $str] < 5} { - return 0 - } - - set matches [regexp -all {[zZtT0-9\ \-\+\.:]} $str] - if {[tcl::string::length $str] == $matches} { - #all characters in legal range - if {[regexp -all {\ } $str] > 1} { - #only a single space is allowed. - return 0 - } - #If we get a space - it is only valid as a convience to represent the T separator - #we can normalize by converting to T here before more tests - set str [string map {" " T t T} $str] - #a further sanity check on T - if {[regexp -all {T} $str] > 1} { - return 0 - } - - #!todo - use full RFC 3339 parser? - #!todo - what if the value is 'time only'? 
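#illustrative examples for the checks below:
#   is_datetime {1979-05-27T00:32:00Z}       -> 1
#   is_datetime {1979-05-27 00:32:00+10:00}  -> 1 (single space accepted in place of T)
#   is_datetime 02:00                        -> 1 (time-only, via is_localtime)
#   is_datetime 1979-5-27                    -> 0 (is_datepart requires 2-digit month/day)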
- - if {[string first T $str] > -1} { - lassign [split $str T] datepart timepart - if {![is_datepart $datepart]} { - return 0 - } - if {![is_timepart $timepart]} { - return 0 - } - } else { - #either a datepart or a localtime - #spec: "If you include only the time portion of an RFC 3339 formatted date-time, it will represent that time of day - # without any relation to a specific day or any offset or timezone." - if {!([is_datepart $str] || [is_localtime $str])} { - return 0 - } - } - - - #Tcl's free-form clock scan (no -format option) is deprecated - # - #if {[catch {clock scan $datepart} err]} { - # puts stderr "tcl clock scan failed err:'$err'" - # return 0 - #} - - } else { - return 0 - } - return 1 - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::utils ---}] -} - -namespace eval tomlish::parse { - #*** !doctools - #[subsection {Namespace tomlish::parse}] - #[para] - #[list_begin definitions] - - #This is a somewhat curly mix of a statemachine and toml-nesting-stack littered with special cases. - #The code is a pig's-nest - but it should be noted that for example trailing single double quotes in multiline strings are perhaps not so trivial to parse using more standard methods either: - # - e.g some kind of backtracking required if using an ABNF parser? - #I don't know the precise technical name for this sort of parser; probably something like "Dog's Breakfast" - #More seriously, we don't have distinct lex/parse steps - so it is basically a 'fused lexer' or 'scannerless parser' - - #It is also desirable for this system to be useful in 'interactive' use. review - would a separate lexer make this easier or harder? - - #A possible alternative more structured approach might be to use a PEG (Parsing Expression Grammar) - - - variable is_parsing 0 ;#whether we are in the middle of parsing tomlish text - - variable state - # states: - # table-space, itable-space, array-space - # array-value-expected,keyval-value-expected,itable-keyval-value-expected, keyval-syntax, - # dquoted-key, squoted-key - # string-state, literal-state, multistring... - # - # notes: - # only the -space states are also 'spaces' ie a container which is pushed/popped on the spacestack - - # - # xxx_value-expected - we also allow for leading whitespace in this state, but once a value is returned we jump to a state based on the containing space. e.g keyval-tail or array-syntax - # - #stateMatrix defines for each state, actions to take for each possible token. - #single-element actions are the name of the next state into which to transition, or a 'POPSPACE' instruction to pop a level off the spacestack and add the data to the parent container. - #dual-element actions are a push instruction and the name of the space to push on the stack. 
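#e.g in table-space below: eof "end-state" is a single-element action (plain next state),
#while barekey {PUSHSPACE "keyval-space" state "keyval-syntax"} carries the PUSHSPACE
#instruction plus the space to push and (here) an explicit starting state.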
- # - PUSHSPACE is a simple push onto the spacestack, zeropoppushspace also pushes, but will first do a pop *if* the current space level is greater than zero (ie if only if not already in root table-space) - - # -- --- --- --- --- --- - #token/state naming guide - # -- --- --- --- --- --- - #tokens : underscore separated or bare name e.g newline, start_quote, start_squote - #private tokens: always have a leading underscore (These are private 'temporary state' tokens that are never returned as actual tokens e.g _start_squote_sequence - #states : always contain at least one dash e.g err-state, table-space - #instructions - # -- --- --- --- --- --- - - - #stateMatrix dict of elements mapping current state to next state based on returned tokens - # current-state {token-encountered next-state ... } - # where next-state can be a 1 or 2 element list. - #If 2 element - the first item is an instruction (ucase) - #If 1 element - it is either a lowercase dashed state name or an ucase instruction - #e.g {PUSHSPACE } or POPSPACE or SAMESPACE - - - #SAMESPACE - got to same space as parent without popping a level, but has it's own autotransition lookup - strange concept - review usecases - - variable stateMatrix - set stateMatrix [dict create] - #--------------------------------------------------------- - #WARNING - #The stateMatrix implementation here is currently messy. - #The code is a mixture of declarative via the stateMatrix and imperative via switch statements during PUSH/POP/SAMESPACE transitions. - #This means the state behaviour has to be reasoned about by looking at both in conjuction. - #--------------------------------------------------------- - - #xxx-space vs xxx-syntax inadequately documented - TODO - - #review - out of date? - # --------------------------------------------------------------------------------------------------------------# - # incomplete example of some state starting at table-space - # --------------------------------------------------------------------------------------------------------------# - # ( = -> keyval-value-expected) - # keyval-syntax (popped -> keyval-space -> keyval-tail) (autotransition on pop) - # keyval-space (autotransition on push ^) - # table-space (barekey^) (startdquote -> dquoted-key ^) - # --------------------------------------------------------------------------------------------------------------# - - dict set stateMatrix\ - table-space { - bom "table-space"\ - whitespace "table-space"\ - newline "table-space"\ - barekey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - squotedkey {PUSHSPACE "keyval-space" state "keyval-syntax" note ""}\ - dquotedkey {PUSHSPACE "keyval-space" state "keyval-syntax"}\ - XXXsingle_dquote "quoted-key"\ - XXXsingle_squote "squoted-key"\ - comment "table-space"\ - starttablename "tablename-state"\ - starttablearrayname "tablearrayname-state"\ - enddquote "err-state"\ - endsquote "err-state"\ - comma "err-state"\ - eof "end-state"\ - equal "err-state"\ - cr "err-lonecr"\ - } - - - - dict set stateMatrix\ - keyval-space {\ - whitespace "keyval-syntax"\ - equal "keyval-value-expected"\ - } - - # ' = ' portion of keyval - dict set stateMatrix\ - keyval-syntax {\ - whitespace "keyval-syntax"\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal "keyval-value-expected"\ - comma "err-state"\ - newline "err-state"\ - eof "err-state"\ - } - #### - dict set stateMatrix\ - keyval-value-expected {\ - whitespace "keyval-value-expected"\ - 
untyped_value {TOSTATE "keyval-untyped-sequence" note "possible datetime datepart"}\ - literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ - string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ - single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ - triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - } - #double_squote {TOSTATE "keyval-tail" note "empty literal received when double squote occurs"} - - #untyped_value sequences without intervening comma are allowed for datepart timepart - #we will produce tomlish with missing SEPS and to_dict must validate whether 2 adjacent barekeys are valid - dict set stateMatrix\ - keyval-untyped-sequence {\ - whitespace "keyval-untyped-sequence"\ - untyped_value {TOSTATE "keyval-tail"}\ - literal {TOSTATE "keyval-tail" note "required for empty literal at EOF"}\ - string {TOSTATE "keyval-tail" note "required for empty string at EOF"}\ - single_dquote {TOSTATE "string-state" returnstate keyval-tail}\ - triple_dquote {PUSHSPACE "multistring-space" returnstate keyval-tail}\ - single_squote {TOSTATE "literal-state" returnstate keyval-tail note "usual way a literal is triggered"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate keyval-tail}\ - startinlinetable {PUSHSPACE itable-space returnstate keyval-tail}\ - startarray {PUSHSPACE array-space returnstate keyval-tail}\ - newline "POPSPACE"\ - comment "keyval-tail"\ - eof "end-state"\ - } - - #2025 - no leading-squote-space - only trailing-squote-space. - - dict set stateMatrix\ - keyval-tail {\ - whitespace "keyval-tail"\ - newline "POPSPACE"\ - comment "keyval-tail"\ - eof "end-state"\ - } - - - #itable-space/ curly-syntax : itables - # x={y=1,} - dict set stateMatrix\ - itable-space {\ - whitespace "itable-space"\ - newline "itable-space"\ - barekey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - squotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - dquotedkey {PUSHSPACE "itable-keyval-space" state "itable-keyval-syntax"}\ - endinlinetable "POPSPACE"\ - comma "err-state"\ - comment "itable-space"\ - eof "err-state"\ - } - #we don't get single_squote etc here - instead we get the resulting squotedkey token - - - # ??? 
review - something like this - # - # x={y =1,} - dict set stateMatrix\ - itable-keyval-syntax {\ - whitespace {TOSTATE "itable-keyval-syntax"}\ - barekey {PUSHSPACE "dottedkey-space"}\ - squotedkey {PUSHSPACE "dottedkey-space"}\ - dquotedkey {PUSHSPACE "dottedkey-space"}\ - equal {TOSTATE "itable-keyval-value-expected"}\ - newline "err-state"\ - eof "err-state"\ - } - - # x={y=1} - dict set stateMatrix\ - itable-keyval-space {\ - whitespace "itable-keyval-syntax"\ - equal {TOSTATE "itable-keyval-value-expected" note "required"}\ - } - - dict set stateMatrix\ - itable-keyval-value-expected {\ - whitespace "itable-keyval-value-expected"\ - untyped_value {TOSTATE "itable-val-tail" note ""}\ - single_dquote {TOSTATE "string-state" returnstate itable-val-tail}\ - triple_dquote {PUSHSPACE "multistring-space" returnstate itable-val-tail}\ - single_squote {TOSTATE "literal-state" returnstate itable-val-tail note "usual way a literal is triggered"}\ - triple_squote {PUSHSPACE "multiliteral-space" returnstate itable-val-tail}\ - startinlinetable {PUSHSPACE "itable-space" returnstate itable-val-tail}\ - startarray {PUSHSPACE "array-space" returnstate itable-val-tail}\ - } - #double_squote not currently generated by _start_squote_sequence - '' processed as single_squote to literal-state just like 'xxx' - # review - # double_squote {TOSTATE "itable-val-tail" note "empty literal received when double squote occurs"} - - - - # x={y=1,z="x"} - #POPSPACE is transition from itable-keyval-space to parent itable-space - dict set stateMatrix\ - itable-val-tail {\ - whitespace "itable-val-tail"\ - endinlinetable "POPSPACE"\ - comma "POPSPACE"\ - newline {TOSTATE "itable-val-tail" note "itable-space ??"}\ - comment "itable-val-tail"\ - eof "err-state"\ - } - # XXXnewline "POPSPACE" - # We shouldn't popspace on newline - as if there was no comma we need to stay in itable-val-tail - # This means the newline and subsequent whitespace, comments etc become part of the preceeding dottedkey record - #e.g - # x = { - # j=1 - # #comment within dottedkey j record - # , # comment unattached - # #comment unattached - # k=2 , #comment unattached - # l=3 #comment within l record - # , m=4 - # #comment associated with m record - # - # #still associated with m record - # } - ## - This doesn't quite correspond to what a user might expect - but seems like a consistent mechanism. - #The awkwardness is because there is no way to put in a comment that doesn't consume a trailing comma - #so we cant do: j= 1 #comment for j1 , - # and have the trailing comma recognised. - # - # To associate: j= 1, #comment for j1 - # we would need some extra processing . (not popping until next key ? extra state itable-sep-tail?) REVIEW - worth doing? - # - # The same issue occurs with multiline arrays. The most natural assumption is that a comment on same line after a comma - # is 'associated' with the previous entry. - # - # These comment issues are independent of the data dictionary being generated for conversion to json etc - as the comments don't carry through anyway, - # but are a potential oddity for manipulating the intermediate tomlish structure whilst attempting to preserve 'associated' comments - # (e.g reordering records within an itable) - #The user's intention for 'associated' isn't always clear and the specs don't really guide on this. 
- - - #dottedkey-space is not (currently) used within [tablename] or [[tablearrayname]] - #it is for keyval ie x.y.z = value - - #this is the state after dot - #we are expecting a complete key token or whitespace - #(initial entry to the space is by one of the keys - which will immediately go to dottedkey-space-tail) - dict set stateMatrix\ - dottedkey-space {\ - whitespace "dottedkey-space"\ - dotsep "err-state"\ - barekey "dottedkey-space-tail"\ - squotedkey "dottedkey-space-tail"\ - dquotedkey "dottedkey-space-tail"\ - newline "err-state"\ - comma "err-state"\ - comment "err-state"\ - equal "err-state"\ - } - - #dottedkeyend "POPSPACE" - #equal "POPSPACE"\ - - - #jmn 2025 - #we have 1 or more dottedkeys so far - need dotsep to add more, whitespace to maintain, equal to pop - dict set stateMatrix\ - dottedkey-space-tail {\ - whitespace "dottedkey-space-tail" - dotsep "dottedkey-space" - equal "POPSPACE"\ - eof "err-state"\ - newline "err-state"\ - } - - #-------------------------------------------------------------------------- - #scratch area - #from_toml {x=1} - # barekey tok - # table-space PUSHSPACE keyval-space state keyval-syntax - # - - - #-------------------------------------------------------------------------- - - - #REVIEW - #toml spec looks like heading towards allowing newlines within inline tables - #https://github.com/toml-lang/toml/issues/781 - - #2025 - multiline itables appear to be valid for 1.1 - which we are targeting. - #https://github.com/toml-lang/toml/blob/main/toml.md#inline-table - - #JMN2025 - #review comment "err-state" vs comment "itable-space" - see if TOML 1.1 comes out and allows comments in multiline ITABLES - #We currently allow multiline ITABLES (also with comments) in the tokenizer. - #if we want to disallow as per TOML 1.0 - we should do so when attempting to get structure? - - - #JMN REVIEW - #dict set stateMatrix\ - # array-space {\ - # whitespace "array-space"\ - # newline "array-space"\ - # untyped_value "SAMESPACE"\ - # startarray {PUSHSPACE "array-space"}\ - # endarray "POPSPACE"\ - # startinlinetable {PUSHSPACE itable-space}\ - # single_dquote "string-state"\ - # single_squote "literal-state"\ - # triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax note "seems ok 2024"}\ - # comma "array-space"\ - # comment "array-space"\ - # eof "err-state-array-space-got-eof"\ - # } - - ## array-space ## - set aspace [dict create] - dict set aspace whitespace "array-space" - dict set aspace newline "array-space" - #dict set aspace untyped_value "SAMESPACE" - dict set aspace untyped_value "array-syntax" - dict set aspace startarray {PUSHSPACE "array-space"} - dict set aspace endarray "POPSPACE" - dict set aspace single_dquote {TOSTATE "string-state" returnstate array-syntax} - dict set aspace triple_dquote {PUSHSPACE "multistring-space" returnstate array-syntax} - dict set aspace single_squote {TOSTATE "literal-state" returnstate array-syntax} - dict set aspace triple_squote {PUSHSPACE "multiliteral-space" returnstate array-syntax} - dict set aspace startinlinetable {PUSHSPACE itable-space} - #dict set aspace comma "array-space" - dict set aspace comment "array-space" - dict set aspace eof "err-state-array-space-got-eof" - dict set stateMatrix array-space $aspace - - #when we pop from an inner array we get to array-syntax - #e.g {x=[[]] ??? 
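        #Illustrative, non-executing sketch: the array-space entries above pair a target state with
        #a returnstate, so that when the quoted string (or pushed sub-space) completes, the FSM
        #resumes in array-syntax rather than array-space (see also spacePopTransitions below).
        if 0 {
            set entry [dict get $stateMatrix array-space single_dquote]
            dict get $entry TOSTATE        ;# -> string-state
            dict get $entry returnstate    ;# -> array-syntax
        }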
- set tarntail [dict create] - dict set tarntail whitespace "tablearrayname-tail" - dict set tarntail newline "err-state" - dict set tarntail comment "err-state" - dict set tarntail eof "err-state" - dict set tarntail endtablename "tablearray-tail" - dict set stateMatrix tablearrayname-tail $tarntail - - #review - somewhat counterintuitive...? - # [(starttablearrayname) (endtablearrayname] - # [(starttablename) (endtablename)] - - # [[xxx]] ??? - set tartail [dict create] - dict set tartail whitespace "tablearray-tail" - dict set tartail newline "table-space" - dict set tartail comment "tablearray-tail" - dict set tartail eof "end-state" - dict set stateMatrix tablearray-tail $tartail - - - - - - - dict set stateMatrix\ - end-state {} - - set knowntokens [list] - set knownstates [list] - dict for {state transitions} $stateMatrix { - if {$state ni $knownstates} {lappend knownstates $state} - dict for {tok instructions} $transitions { - if {$tok ni $knowntokens} {lappend knowntokens $tok} - } - } - dict set stateMatrix nostate {} - foreach tok $knowntokens { - dict set stateMatrix nostate $tok "err-nostate-received-token-$tok" - } - - - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #purpose - debugging? remove? - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - #build a list of 'push triggers' from the stateMatrix - # ie tokens which can push a new space onto spacestack - set push_trigger_tokens [list] - tcl::dict::for {s transitions} $stateMatrix { - tcl::dict::for {token transition_to} $transitions { - set instruction [lindex $transition_to 0] - switch -exact -- $instruction { - PUSHSPACE - zeropoppushspace { - if {$token ni $push_trigger_tokens} { - lappend push_trigger_tokens $token - } - } - } - } - } - ::tomlish::log::debug "push_trigger_tokens: $push_trigger_tokens" - # -- --- --- --- --- --- --- --- --- --- --- --- --- --- --- - - - - #This seems hacky... (deprecate in favour of explicit arguments to the instructions in stateMatrix?) - #spacePopTransitions, spacePushTransitions, spaceSameTransitions below for auto state redirections on POPSPACE,PUSHSPACE,SAMESPACE - - #mainly for the -space states: - #redirect to another state $c based on a state transition from $whatever to $b - # e.g "string {array-space array-syntax}" means when transitioning from string to array-space, jump to array-syntax instead. - #this is useful as we often don't know state $b. e.g when it is decided by 'POPSPACE' - - #use dict set to add values so we can easily add/remove/comment lines - - #Push to, next - #default first states when we push to these spaces - variable spacePushTransitions [dict create] - dict set spacePushTransitions keyval-space keyval-syntax - dict set spacePushTransitions itable-keyval-space itable-keyval-syntax - dict set spacePushTransitions array-space array-space - dict set spacePushTransitions table-space tablename-state - #dict set spacePushTransitions #itable-space itable-space - - #Pop to, next - variable spacePopTransitions [dict create] - dict set spacePopTransitions array-space array-syntax - - - #itable-keyval-space itable-val-tail - #review - #we pop to keyval-space from dottedkey-space or from keyval-value-expected? 
we don't always want to go to keyval-tail - #leave it out and make the POPSPACE caller explicitly specify it - #keyval-space keyval-tail - - variable spaceSameTransitions [dict create] - #JMN test - #dict set spaceSameTransitions array-space array-syntax - - #itable-keyval-space itable-val-tail - - - variable state_list ;#reset every tomlish::decode::toml - - namespace export tomlish toml - namespace ensemble create - - #goNextState has various side-effects e.g pushes and pops spacestack - #REVIEW - setting nest and v elements here is ugly - #todo - make neater, more single-purpose? - proc goNextState {tokentype tok currentstate} { - variable state - variable nest - variable v - - set prevstate $currentstate - - - variable spacePopTransitions - variable spacePushTransitions - variable spaceSameTransitions - - variable last_space_action "none" - variable last_space_type "none" - variable state_list - - set result "" - set starttok "" - - if {[dict exists $::tomlish::parse::stateMatrix $currentstate $tokentype]} { - set transition_to [dict get $::tomlish::parse::stateMatrix $currentstate $tokentype] - ::tomlish::log::debug "--->> goNextState tokentype:$tokentype tok:$tok currentstate:$currentstate : transition_to = $transition_to" - switch -exact -- [lindex $transition_to 0] { - POPSPACE { - set popfromspace_info [spacestack peek] - set popfromspace_state [dict get $popfromspace_info state] - spacestack pop - set parent_info [spacestack peek] - set type [dict get $parent_info type] - set parentspace [dict get $parent_info state] - - set last_space_action "pop" - set last_space_type $type - - if {[dict exists $parent_info returnstate]} { - set next [dict get $parent_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected to stored returnstate $next <<---" - } else { - ### - #review - do away with spacePopTransitions - which although useful to provide a default.. 
- # - involve error-prone configurations distant to the main state transition configuration in stateMatrix - if {[dict exists $::tomlish::parse::spacePopTransitions $parentspace]} { - set next [dict get $::tomlish::parse::spacePopTransitions $parentspace] - ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace redirected state to $next (spacePopTransitions)<<---" - } else { - set next $parentspace - ::tomlish::log::info "--->> POPSPACE transition from $popfromspace_state to parent space $parentspace<<---" - } - } - set result $next - } - SAMESPACE { - set currentspace_info [spacestack peek] - ::tomlish::log::debug "--->> SAMESPACE got current space entry: $currentspace_info <<<<<" - set type [dict get $currentspace_info type] - set currentspace [dict get $currentspace_info state] - - if {[dict exists $currentspace_info returnstate]} { - set next [dict get $currentspace_info returnstate] - #clear the returnstate on current level - set existing [spacestack pop] - dict unset existing returnstate - spacestack push $existing ;#re-push modification - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected to stored returnstate $next" - } else { - if {[dict exists $::tomlish::parse::spaceSameTransitions $currentspace]} { - set next [dict get $::tomlish::parse::spaceSameTransitions $currentspace] - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace redirected state to $next (spaceSameTransitions)" - } else { - set next $currentspace - ::tomlish::log::info "--->> SAMESPACE transition to space $currentspace" - } - } - set result $next - } - zeropoppushspace { - if {$nest > 0} { - #pop back down to the root level (table-space) - spacestack pop - set parentinfo [spacestack peek] - set type [dict get $parentinfo type] - set target [dict get $parentinfo state] - - set last_space_action "pop" - set last_space_type $type - - #----- - #standard pop - set parentlevel [expr {$nest -1}] - lappend v($parentlevel) [set v($nest)] - incr nest -1 - #----- - } - #re-entrancy - - #set next [list PUSHSPACE [lindex $transition_to 1]] - set nexttokentype ${tokentype}2 ;#fake token type e.g tablename2 or tablearrayname2 - ::tomlish::log::debug "--->> zeropoppushspace goNextState RECURSE. calling goNextState $nexttokentype $currentstate" - set transition_info [::tomlish::parse::goNextState $nexttokentype $tok $currentstate] - set result [dict get $transition_info newstate] - } - PUSHSPACE { - set original_target [dict get $transition_to PUSHSPACE] - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. - } - if {[dict exists $transition_to starttok]} { - set starttok [dict get $transition_to starttok] - } - spacestack push [dict create type space state $original_target] - - set last_space_action "push" - set last_space_type "space" - - if {[dict exists $transition_to state]} { - #an explicit state in the pushed space was requested in the stateMatrix - override the spacePushTransition (spacePushTransitions can be deprecated if we require explicitness?) 
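                        #Illustrative, non-executing sketch of such an entry as configured above for
                        #itable-space -> barekey, where the explicit 'state' overrides the spacePushTransitions default:
                        if 0 {
                            set entry [dict get $::tomlish::parse::stateMatrix itable-space barekey]
                            dict get $entry PUSHSPACE    ;# -> itable-keyval-space
                            dict get $entry state        ;# -> itable-keyval-syntax
                        }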
- set next [dict get $transition_to state] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next by explicit 'state' entry" - } else { - #puts $::tomlish::parse::spacePushTransitions - if {[dict exists $::tomlish::parse::spacePushTransitions $original_target]} { - set next [dict get $::tomlish::parse::spacePushTransitions $original_target] - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target redirected state to $next (spacePushTransitions) " - } else { - set next $original_target - ::tomlish::log::info "--->> PUSHSPACE transition to space $original_target" - } - } - set result $next - } - TOSTATE { - if {[dict exists $transition_to returnstate]} { - #adjust the existing space record on the stack. - #struct::stack doesn't really support that - so we have to pop and re-push - #todo - investigate a custom stack implementation where we can efficiently lset the top of the stack - set currentspace [spacestack pop] - dict set currentspace returnstate [dict get $transition_to returnstate] - spacestack push $currentspace ;#return modified info to stack so when we POPSPACE the returnstate is available. - } - set result [dict get $transition_to TOSTATE] - } - default { - #simplified version of TOSTATE - set result [lindex $transition_to 0] ;#ignore everything but first word - } - } - } else { - ::tomlish::log::error "--->> No state transition defined from state $currentstate when tokentype $tokentype received" - set result "nostate" - } - lappend state_list [list tokentype $tokentype from $currentstate to $result] - set state $result - ::tomlish::log::notice "--->> STATE TRANSITION tokenType: '$tokentype' tok:$tok triggering '$currentstate' -> '$result' last_space_action:$last_space_action " - return [dict create prevstate $prevstate newstate $result space_action $last_space_action starttok $starttok] - } - - proc report_line {{line ""}} { - variable linenum - variable is_parsing - if {$is_parsing} { - if {$line eq ""} { - set line $linenum - } - return "Line Number: $line" - } else { - #not in the middle of parsing tomlish text - return nothing. - return "" - } - } - - #produce a *slightly* more readable string rep of the nest for puts etc. - proc nest_pretty1 {list} { - set prettier "{" - - foreach el $list { - if { [lindex $el 0] eq "NEWLINE"} { - append prettier "[list $el]\n" - } elseif {([llength $el] > 1) && ([lindex $el 0] in {KEY DQKEY SQKEY TABLE ARRAY})} { - append prettier [nest_pretty1 $el] - } else { - append prettier "[list $el] " - } - } - append prettier "}" - return $prettier - } - - proc set_tokenType {t} { - variable tokenType - variable tokenType_list - if {![info exists tokenType]} { - set tokenType "" - } - lappend tokenType_list $t - set tokenType $t - } - - proc switch_tokenType {t} { - variable tokenType - variable tokenType_list - lset tokenType_list end $t - set tokenType $t - } - - proc get_tokenType {} { - variable tokenType - return $tokenType - } - - - proc get_token_waiting {} { - variable token_waiting - return [lindex $token_waiting 0] - } - proc clear_token_waiting {} { - variable token_waiting - set token_waiting [list] - } - - #token_waiting is a list - but our standard case is to have only one - #in certain circumstances such as near eof we may have 2 - #the set_token_waiting function only allows setting when there is not already one waiting. 
- #we want to catch cases of inadvertently trying to set multiple - # - the reason being that the state transition triggered by the previous token may have invalidated the assumptions made when a token was added as waiting. - proc set_token_waiting {args} { - if {[llength $args] %2 != 0} { - error "tomlish set_token_waiting must have args of form: type value complete 0|1" - } - variable token_waiting - - if {[llength $token_waiting] && [dict get [lindex $token_waiting end] type] ne "eof"} { - #tokloop already set a token_waiting - but something (post tokloop processing?) is trying to set another - #we may need to remove the existing token_waiting and reset the tokloop index to the previous char so it's reprocessed in the possibly new context - #rather than attempt to make the right decision here - we raise an error and require the caller to check/handle it - set err "tomlish set_token_waiting already has token_waiting: [lindex $token_waiting 0]" - append err \n " - cannot add token_waiting: $args" - error $err - #set tomlish::parse::i [expr {[dict get $token_waiting startindex] -1}] - #set token_waiting [list] - } - - set waiting [dict create] - dict for {k v} $args { - switch -exact $k { - type - complete { - dict set waiting $k $v - } - value { - dict set waiting tok $v - } - startindex { - dict set waiting startindex $v - } - default { - error "tomlish set_token_waiting error - unrecognised key $k. known keys: [dict keys $args]" - } - } - } - if {![tcl::string::is boolean -strict [dict get $waiting complete]]} { - error "tomlish set_token_waiting error - 'complete' must be a boolean. got [dict get $waiting complete]" - } - if {![llength $token_waiting]} { - set token_waiting [list $waiting] - } else { - #an extra sanity-check that we don't have more than just the eof.. - if {[llength $token_waiting] > 1} { - set err "tomlish Unexpected. Existing token_waiting count > 1.\n" - foreach tw $token_waiting { - append err " $tw" \n - } - append err " - cannot add token_waiting: $waiting" - error $err - } - #last entry must be a waiting eof - set token_waiting [list $waiting [lindex $token_waiting end]] - } - return - } - - #returns 0 or 1 - #tomlish::parse::tok - #we attempt to do this without lookahead (potential use in streaming toml? for what benefit?) todo -final flag - # - the possible benefit is being able to more easily process in arbitrarily split chunks (although we would still have to watch crlf splitting ?) - # - interactive use? 
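    #Sketch of how the tokenizer might be driven (assumption - the real decode loop lives in
    #tomlish::decode::toml, outside this excerpt; $toml_text below is a stand-in for the input string).
    #tok is called repeatedly with the whole source; the namespace variable i persists between calls,
    #and each call that produces a token sets tokenType/tok and returns 1.
    if 0 {
        set ::tomlish::parse::i 0
        while {[::tomlish::parse::tok $toml_text]} {
            set ttype [::tomlish::parse::get_tokenType]
            set tval  $::tomlish::parse::tok
            #each emitted token then advances the state machine
            ::tomlish::parse::goNextState $ttype $tval $::tomlish::parse::state
        }
    }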
- - proc tok {s} { - variable nest - variable v - variable i - variable tok - variable type ;#character type - variable state ;#FSM - - - variable tokenType - variable tokenType_list - - - variable endToken - - variable lastChar - - variable braceCount - variable bracketCount - - - #------------------------------ - #Previous run found another (presumably single-char) token - #The normal case is for there to be only one dict in the list - #multiple is an exception - primarily for eof - variable token_waiting - if {[llength $token_waiting]} { - set waiting [lindex $token_waiting 0] - - set tokenType [dict get $waiting type] - set tok [dict get $waiting tok] - #todo: dict get $token_waiting complete - set token_waiting [lrange $token_waiting 1 end] - return 1 - } - #------------------------------ - - set resultlist [list] - set sLen [tcl::string::length $s] - - set slash_active 0 - set quote 0 - set c "" - for {} {$i < $sLen} {} { - if {$i > 0} { - set lastChar [tcl::string::index $s [expr {$i - 1}]] - } else { - set lastChar "" - } - - set c [tcl::string::index $s $i] - set cindex $i - set ctest [tcl::string::map {\{ lc \} rc \[ lb \] rb \" dq ' sq \\ bsl \r cr \n lf \t tab \uFEFF bom} $c] - tomlish::log::debug "- tokloop char <$ctest> index $i tokenType:$tokenType tok:<$tok>" - #puts "got char $c during tokenType '$tokenType'" - incr i ;#must incr here because we do returns inside the loop - - switch -exact -- $ctest { - # { - set had_slash $slash_active - set slash_active 0 - - if {$had_slash} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - #for multiliteral, multistring - data and/or end - incr i -1 - return 1 - } - _start_squote_sequence { - #pseudo token beginning with underscore - never returned to state machine - review - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i [tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed (see tomlish::utils::is_barekey). [tomlish::parse::report_line]" - } - whitespace { - # hash marks end of whitespace token - #do a return for the whitespace, set token_waiting - #set_token_waiting type comment value "" complete 1 - incr i -1 ;#leave comment for next run - return 1 - } - untyped_value { - #REVIEW! the spec isn't clear.. is whitespace after an int,bool etc required before comment? - #we will accept a comment marker as an immediate terminator of the untyped_value. - incr i -1 - return 1 - } - starttablename - starttablearrayname { - #fix! - error "tomlish Character '#' is invalid first character for $tokenType. 
[tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare parts - but allowed in quoted parts - let tablename parser sort it out - append tok $c - } - default { - #dquotedkey, string,literal, multistring - append tok $c - } - } - } else { - switch -- $state { - multistring-space { - set_tokenType stringpart - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "#" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "#" - } - default { - #start of token if we're not in a token - set_tokenType comment - set tok "" ;#The hash is not part of the comment data - } - } - } - } - lc { - #left curly brace - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i [tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - starttablearrayname { - #*bare* tablename can only contain letters,digits underscores - error "tomlish Invalid tablename first character \{ [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #valid in quoted parts - append tok $c - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\{" - } - default { - #end any other token. - incr i -1 - return 1 - } - } - } else { - switch -exact -- $state { - itable-keyval-value-expected - keyval-value-expected { - #switch last key to tablename?? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - array-space - array-syntax { - #nested anonymous inline table - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "startinlinetable" - set tok "\{" - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\{" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\{" - } - default { - error "tomlish state: '$state'. left brace case not implemented [tomlish::parse::report_line]" - } - } - } - - } - rc { - #right curly brace - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - tablename { - if {$had_slash} {append tok "\\"} - #invalid! - but leave for datastructure loading stage to catch - set_token_waiting type endinlinetable value "" complete 1 startindex $cindex - return 1 - } - starttablearrayname - tablearrayname { - if {$had_slash} {append tok "\\"} - #invalid! 
- but leave for datastructure loading stage to catch - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } - default { - #end any other token - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-space { - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname-state problem" - set_tokenType "endinlinetable" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - array-syntax - array-space { - #invalid - set_tokenType "endinlinetable" - set tok "\}" - return 1 - } - itable-val-tail { - set_tokenType "endinlinetable" - set tok "" - #we need to pop the keyval - and then reprocess to pop the inlinetable - so we incr -1 - incr i -1 - return 1 - } - itable-keyval-syntax { - error "tomlish endinlinetable unexpected at this point. Expecting key=val syntax [tomlish::parse::report_line]" - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\}" - } - multiliteral-space { - set_tokenType "literalpart" ; #review - set tok "\}" - } - default { - #JMN2024b keyval-tail? - error "tomlish state '$state'. endinlinetable case not implemented [tomlish::parse::report_line]" - } - } - } - - } - lb { - #left square bracket - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename { - #change the tokenType - switch_tokenType "starttablearrayname" - set tok "" ;#no output into the tomlish list for this token - #any following whitespace is part of the tablearrayname, so return now - return 1 - } - tablename - tablearrayname { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - #append tok "\\[" - append tok {\[} - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - #invalid at this point - state machine should disallow: - # table -> starttablearrayname - # tablearray -> starttablearrayname - set_token_waiting type starttablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "\[" - } - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok "\[" - } - default { - #end any other token. 
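                            #(the current token is emitted and the bracket is pushed back, to be
                            # re-read in the new context on the next call)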
- incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - keyval-value-expected - itable-keyval-value-expected { - set_tokenType "startarray" - set tok "\[" - return 1 - } - array-space - array-syntax { - #nested array? - set_tokenType "startarray" - set tok "\[" - return 1 - #error "state: array-space. startarray case not implemented [tomlish::parse::report_line]" - } - table-space { - #table name - #assume it's a single bracket - but we need to wait for non-bracket to confirm it's not a tablearray - #note that a starttablearrayname token may contain whitespace between the brackets - # e.g \[ \[ - set_tokenType "starttablename" - set tok "" ;#there is no output into the tomlish list for this token - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\[" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\[" - } - itable-space { - #handle state just to give specific error msg - error "tomlish state: '$state'. Left square bracket invalid. Cannot start array in inline table without key. Use key=\[\] syntax. [tomlish::parse::report_line]" - } - default { - error "tomlish state: '$state'. startarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - rb { - #right square bracket - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - #???? - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } else { - incr i -1 - if {$had_slash} {incr i -1} ;#reprocess - return 1 - } - } - tablename { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablename value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" - } - } - } - tablearrayname { - #e.g a."x[0]".c is valid table name sequence - so we need to track quoting to know if rb is an end token - if {$had_slash} { - #resultant tablename may be invalid - but leave for datastructure loading stage to catch - append tok "\\]" - } else { - if {[tomlish::utils::tok_in_quotedpart $tok] eq ""} { - set_token_waiting type endtablearrayname value "" complete 1 startindex $cindex - return 1 - } else { - #we appear to still be in single or double quoted section - append tok "]" - } - } - } - default { - incr i -1 - return 1 - } - } - } else { - #$slash_active not relevant when no tokenType - switch -exact -- $state { - array-syntax - array-space { - #invalid - but allow parser statemachine to report it. 
- set_tokenType "endarray" - set tok "\]" - return 1 - } - table-space { - #invalid - but allow parser statemachine to report it. ? - set_tokenType "endarray" - set tok "\]" - return 1 - } - tablename-state { - #e.g [] - empty tablename - allowed or not? - #empty tablename/tablearrayname ? - #error "unexpected tablename problem" - - set_tokenType "endtablename" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-state { - error "tomlish unexpected tablearrayname problem" - set_tokenType "endtablearray" - set tok "" ;#no output into the tomlish list for this token - return 1 - } - tablearrayname-tail { - #[[xxx] - set_tokenType "endtablename" - #sequence: starttablename -> starttablearrayname -> endtablearrayname -> endtablename - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok "\]" - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\]" - } - default { - error "tomlish state '$state'. endarray case not implemented [tomlish::parse::report_line]" - } - } - } - } - bsl { - #backslash - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - #end whitespace token - incr i -1 ;#reprocess bsl in next run - return 1 - } else { - error "tomlish Unexpected backslash during whitespace. [tomlish::parse::report_line]" - } - } - literal - literalpart - squotedkey { - #never need to set slash_active true when in single quoted tokens - append tok "\\" - set slash_active 0 - } - string - dquotedkey - comment { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - stringpart { - if {$slash_active} { - #assert - quotes empty - or we wouldn't have slash_active - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - starttablename - starttablearrayname { - error "tomlish backslash is invalid as first character of $tokenType [tomlish::parse::report_line]" - } - tablename - tablearrayname { - if {$slash_active} { - set slash_active 0 - append tok "\\\\" - } else { - set slash_active 1 - } - } - barekey { - error "tomlish Unexpected backslash during barekey. [tomlish::parse::report_line]" - } - default { - error "tomlish Backslash unexpected during tokentype: '$tokenType'. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$slash_active} { - set_tokenType "stringpart" - set tok "\\\\" - set slash_active 0 - } else { - set slash_active 1 - } - } - multiliteral-space { - #nothing can be escaped in multiliteral-space - not even squotes (?) review - set_tokenType "literalpart" - set tok "\\" - } - default { - error "tomlish tok error: Unexpected backslash when no token is active. 
[tomlish::parse::report_line]" - } - } - } - } - sq { - #single quote - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote { - #for within multiliteral - #short tentative_accum_squote tokens are returned if active upon receipt of any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - #assert state = trailing-squote-space - append tok $c - if {$existingtoklen == 4} { - #maxlen to be a tentative_accum_squote is multisquote + 2 = 5 - #return tok with value ''''' - return 1 - } - } - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - #no conclusion can yet be reached - append tok $c - } - 2 { - #enter multiliteral - #switch? - append tok $c - set_tokenType triple_squote - return 1 - } - default { - #if there are more than 3 leading squotes we also enter multiliteral space and the subsequent ones are handled - #by the tentative_accum_squote check for ending sequence which can accept up to 5 and reintegrate the - #extra 1 or 2 squotes as data. - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_squote_sequence'" - } - } - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - #end whitespace - incr i -1 ;#reprocess sq - return 1 - } - literal { - #slash_active always false - #terminate the literal - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - literalpart { - #ended by ''' - but final could be '''' or ''''' (up to 2 squotes allowed directly before ending triple squote sequence) - #todo - # idea: end this literalpart (possibly 'temporarily') - # let the sq be reprocessed in the multiliteral-space to push an end-multiliteral-sequence to state stack - # upon popping end-multiliteral-sequence - stitch quotes back into this literalpart's token (if either too short - or a long ending sequence as shown above) - incr i -1 ;#throw the "'" back to loop - will be added to a tentative_accum_squote token for later processing - return 1 - } - XXXitablesquotedkey { - set_token_waiting type endsquote value "'" complete 1 startindex $cindex - return 1 - } - squotedkey { - ### - #set_token_waiting type endsquote value "'" complete 1 - return 1 - } - starttablename - starttablearrayname { - #!!! - incr i -1 - return 1 - } - tablename - tablearrayname { - append tok $c - } - barekey { - #barekeys now support all sorts of unicode letter/number chars for other cultures - #but not punctuation - not even for those of Irish heritage who don't object - #to the anglicised form of some names. - # o'shenanigan seems to not be a legal barekey - #The Irish will have to use an earlier form Ó - which apparently many may prefer anyway. - error "tomlish Unexpected single quote during barekey. 
[tomlish::parse::report_line]" - } - default { - append tok $c - } - } - } else { - switch -exact -- $state { - array-space - keyval-value-expected - itable-keyval-value-expected { - #leading squote - #pseudo-token _start_squote_sequence ss not received by state machine - #This pseudotoken will trigger production of single_squote token or triple_squote token - #It currently doesn't trigger double_squote token - #(handle '' same as 'x' ie produce a single_squote and go into processing literal) - #review - producing double_squote for empty literal may be slightly more efficient. - #This token is not used to handle squote sequences *within* a multiliteral - set_tokenType "_start_squote_sequence" - set tok "'" - } - multiliteral-space { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up a tentative_accum_squote to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "tentative_trigger_squote" ;#trigger tentative_accum_squote - set tok "'" - return 1 - } - table-space - itable-space { - #tests: squotedkey.test squotedkey_itable.test - set_tokenType "squotedkey" - set tok "" - } - XXXtable-space - XXXitable-space { - #future - could there be multiline keys? MLLKEY, MLBKEY ? - #this would (almost) allow arbitrary tcl dicts to be stored in toml (aside from escaping issues) - #probably unlikely - as it's perhaps not very 'minimal' or ergonomic for config files - #@2025 ABNF for toml mentions key, simple-key, unquoted-key, quoted-key and dotted-key - #where key is simple-key or dotted-key - no MLL or MLB components - #the spec states solution for arbitrary binary data is application specific involving encodings - #such as hex, base64 - set_tokenType "_start_squote_sequence" - set tok "'" - return 1 - } - tablename-state { - #first char in tablename-state/tablearrayname-state - set_tokenType "tablename" - append tok "'" - } - tablearrayname-state { - set_tokenType "tablearrayname" - append tok "'" - } - literal-state { - #shouldn't get here? review - tomlish::log::debug "- tokloop sq during literal-state with no tokentype - empty literal?" - set_tokenType "literal" - incr -1 - return 1 - } - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "," - #error "tomlish unimplemented - squote during state '$state'. [tomlish::parse::report_line]" - } - dottedkey-space { - set_tokenType "squotedkey" - } - default { - error "tomlish unhandled squote during state '$state'. 
[tomlish::parse::report_line]" - } - } - } - - } - dq { - #double quote - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - tentative_accum_dquote { - #within multistring - #short tentative_accum_dquote tokens are returned if active upon receipt of any other character - #longest allowable for leading/trailing are returned here - #### - set existingtoklen [tcl::string::length $tok] ;#toklen prior to this squote - #assert state = trailing-squote-space - append tok $c - if {$existingtoklen == 4} { - #maxlen to be a tentative_accum_dquote is multidquote + 2 = 5 - #return tok with value """"" - return 1 - } - } - _start_dquote_sequence { - #pseudo/temp token creatable during keyval-value-expected itable-keyval-value-expected or array-space - switch -- [tcl::string::length $tok] { - 1 { - #no conclusion can yet be reached - append tok $c - } - 2 { - #enter multistring - #switch? - append tok $c - set_tokenType triple_dquote - return 1 - } - default { - #if there are more than 3 leading dquotes we also enter multistring space and the subsequent ones are handled - #by the tentative_accum_dquote check for ending sequence which can accept up to 5 and reintegrate the - #extra 1 or 2 dquotes as data. - error "tomlish unexpected token length [tcl::string::length $tok] in '_start_dquote_sequence'" - } - } - } - literal - literalpart { - append tok $c - } - string { - if {$had_slash} { - append tok "\\" $c - } else { - #unescaped quote always terminates a string - set_token_waiting type enddquote value "\"" complete 1 startindex $cindex - return 1 - } - } - stringpart { - #sub element of multistring - if {$had_slash} { - append tok "\\" $c - } else { - incr i -1 ;#throw the {"} back to loop - will be added to a tentative_accum_dquote token for later processing - return 1 - } - } - whitespace { - #assert: had_slash will only ever be true in multistring-space - if {$had_slash} { - incr i -2 - return 1 - } else { - #end whitespace token - throw dq back for reprocessing - incr i -1 - return 1 - } - } - comment { - if {$had_slash} {append tok "\\"} - append tok $c - } - XXXdquotedkey { - if {$had_slash} { - append tok "\\" - append tok $c - } else { - set_token_waiting type enddquote value "\"" complete 1 startindex $cindex - return 1 - } - } - dquotedkey { - ### - if {$had_slash} { - append tok "\\" - append tok $c - } else { - #set_token_waiting type enddquote value {"} complete 1 - return 1 - } - } - squotedkey { - append tok $c - } - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok $c - } - starttablename - starttablearrayname { - incr i -1 ;## - return 1 - } - default { - error "tomlish got quote during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - #$slash_active not relevant when no tokenType - #token is string only if we're expecting a value at this point - switch -exact -- $state { - array-space - keyval-value-expected - itable-keyval-value-expected { - #leading dquote - #pseudo-token _start_squote_sequence ss not received by state machine - #This pseudotoken will trigger production of single_dquote token or triple_dquote token - #It currently doesn't trigger double_dquote token - #(handle "" same as "x" ie produce a single_dquote and go into processing string) - #review - producing double_dquote for empty 
string may be slightly more efficient. - #This token is not used to handle dquote sequences once *within* a multistring - set_tokenType "_start_dquote_sequence" - set tok {"} - } - multistring-space { - if {$had_slash} { - set_tokenType "stringpart" - set tok "\\\"" - } else { - #each literalpart is not necessarily started/ended with squotes - but may contain up to 2 in a row - #we are building up a tentative_accum_squote to determine if - #a) it is shorter than ''' so belongs in a literalpart (either previous, subsequent or it's own literalpart between newlines - #b) it is exactly ''' and we can terminate the whole multiliteral - #c) it is 4 or 5 squotes where the first 1 or 2 beling in a literalpart and the trailing 3 terminate the space - set_tokenType "tentative_trigger_dquote" ;#trigger tentative_accum_dquote - set tok {"} - return 1 - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok "\"" - } - table-space - itable-space { - set_tokenType "dquotedkey" - set tok "" - } - dottedkey-space { - set_tokenType dquotedkey - set tok "" - - #only if complex keys become a thing - #set_tokenType dquote_seq_begin - #set tok $c - } - tablename-state { - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - default { - error "tomlish Unexpected dquote during state '$state' [tomlish::parse::report_line]" - } - } - } - } - = { - set had_slash $slash_active - set slash_active 0 - - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart - squotedkey { - #assertion had_slash 0 - append tok $c - } - string - comment - dquotedkey { - #for these tokenTypes an = is just data. - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - set_token_waiting type equal value = complete 1 startindex $cindex - return 1 - } - } - barekey { - #set_token_waiting type equal value = complete 1 - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character '=' is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #invalid in bare name - but valid in quoted parts - leave for tablename parser to sort out - append tok $c - } - default { - error "tomlish unexpected = character during tokentype $tokenType. case not implemented. [tomlish::parse::report_line]" - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} { - append tok "\\" - } - append tok = - } - multiliteral-space { - set_tokenType "literalpart" - set tok "=" - } - dottedkey-space { - set_tokenType "equal" - set tok "=" - return 1 - } - default { - set_tokenType "equal" - set tok = - return 1 - } - } - } - } - cr { - #REVIEW! - # \r carriage return - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. 
- set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #we have received a double cr - ::tomlish::log::warn "double cr - will generate cr token. needs testing" - set_tokenType "cr" ;#lone cr token will generally raise an error - but let state machine handle it - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal { - append tok $c - } - literalpart { - #part of MLL string (multi-line literal string) - #we need to split out crlf as a separate NEWLINE to be consistent - ::tomlish::log::warn "literalpart ended by cr - needs testing" - #return literalpart temporarily - allow cr to be reprocessed from multiliteral-space - incr i -1 - return 1 - } - stringpart { - #stringpart is a part of MLB string (multi-line basic string) - #throw back the cr - if followed by lf it will become a {NEWLINE crlf} entry within the MULTISTRING list (e.g between STRINGPART entries) - incr i -1 - return 1 - } - starttablename - starttablearrayname { - error "tomlish Character is invalid first character for $tokenType. [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #could in theory be valid in quoted part of name - #review - might be better just to disallow here - append tok $c - } - whitespace { - #it should technically be part of whitespace if not followed by lf - #but outside of values we are also free to map it to be another NEWLINE instead? REVIEW - incr i -1 - return 1 - } - untyped_value { - incr i -1 - return 1 - } - default { - #!todo - error out if cr inappropriate for tokenType - append tok $c - } - } - } else { - #lf may be appended if next - #review - lone cr as newline? - this is uncommon - but so is lone cr in a string(?) 
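                        #Non-executing illustration: a lone cr starts a newline token whose value is "cr";
                        #if the very next char is lf, the lf branch appends "lf" so the emitted value is "crlf".
                        if 0 {
                            set demo_tok cr
                            append demo_tok lf    ;# -> crlf (a single NEWLINE token)
                        }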
- set_tokenType "newline" - set tok cr - } - } - lf { - # \n newline - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #review - #this lf is the trailing part of a crlf - append tok lf ;#assert we should now have tok "crlf" - as a previous cr is the only way to have an incomplete newline tok - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - #multiliteral or multistring - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal { - #nl is not allowed *within* a literal - require multiliteral syntax for any literal containing a newline ''' ''' - #even though we terminate the literal without the closing quote here - the token_waiting newline should trigger a state error - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - literalpart { - #we allow newlines - but store them within the multiliteral as their own element - #This is a legitimate end to the literalpart - but not the whole multiliteral - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - stringpart { - if {$had_slash} { - #emit the stringpart (return 1), queue the continuation, go back 1 to reprocess the lf (incr i -1) - set_token_waiting type continuation value \\ complete 1 startindex [expr {$cindex-1}] - incr i -1 - return 1 - } else { - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - starttablename - tablename - tablearrayname - starttablearrayname { - error "tomlish Character is invalid in $tokenType. [tomlish::parse::report_line]" - } - default { - #newline ends all other tokens. - #note for string: we don't add (raw unescaped) newline to simple string. (must use multi-string for this) - #note for whitespace: - # we will use the convention that \n terminates the current whitespace even if whitespace follows - # ie whitespace is split into separate whitespace tokens at each newline - - #puts "-------------- newline lf during tokenType $tokenType" - set_token_waiting type newline value lf complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType "newline" - set tok lf - return 1 - } - } - multiliteral-space { - #assert had_slash 0 - set_tokenType "newline" - set tok "lf" - return 1 - } - default { - #ignore slash? error? 
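                            #note: had_slash is not consulted in this branch (a pending backslash would be
                            #silently ignored); the commented-out block below preserves the earlier handling
                            #that emitted a continuation token instead.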
- set_tokenType "newline" - set tok lf - return 1 - } - } - #if {$had_slash} { - # #CONT directly before newline - allows strings_5_byteequivalent test to pass - # set_tokenType "continuation" - # set tok "\\" - # incr i -1 - # return 1 - #} else { - # set_tokenType newline - # set tok lf - # return 1 - #} - } - } - , { - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - comment - tablename - tablearrayname { - if {$had_slash} {append tok "\\"} - append tok , - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - #stringpart can have up to 2 quotes too - if {$had_slash} {append tok "\\"} - append tok $c - } - literal - literalpart - squotedkey { - #assert had_slash always 0 - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - set_token_waiting type comma value "," complete 1 startindex $cindex - return 1 - } - } - default { - set_token_waiting type comma value "," complete 1 startindex $cindex - if {$had_slash} {append tok "\\"} - return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "," - } - multiliteral-space { - #assert had_slash 0 - set_tokenType "literalpart" - set tok "," - } - default { - set_tokenType "comma" - set tok "," - return 1 - } - } - } - } - . { - set had_slash $slash_active - set slash_active 0 - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - comment - untyped_value { - if {$had_slash} {append tok "\\"} - append tok $c - } - string - dquotedkey { - if {$had_slash} {append tok "\\"} - append tok $c - } - stringpart { - if {$had_slash} {append tok "\\"} - append tok $c - } - literal - literalpart - squotedkey { - #assert had_slash always 0 - append tok $c - } - whitespace { - switch -exact -- $state { - multistring-space { - #review - if {$had_slash} { - incr i -2 - } else { - incr i -1 - } - return 1 - } - xxxdottedkey-space { - incr i -1 - return 1 - } - dottedkey-space-tail { - incr i -1 - return 1 - } - default { - error "tomlish Received period during tokenType 'whitespace' [tomlish::parse::report_line]" - } - } - } - starttablename - starttablearrayname { - #This would correspond to an empty table name - error "tomlish Character '.' is not allowed as first character ($tokenType). [tomlish::parse::report_line]" - } - tablename - tablearrayname { - #subtable - split later - review - append tok $c - } - barekey { - #e.g x.y = 1 - #we need to transition the barekey to become a structured table name ??? review - #x is the tablename y is the key - set_token_waiting type dotsep value "." 
complete 1 startindex $cindex - return 1 - } - default { - error "tomlish Received period during tokenType '$tokenType' [tomlish::parse::report_line]" - #set_token_waiting type period value . complete 1 - #return 1 - } - } - } else { - switch -exact -- $state { - multistring-space { - set_tokenType "stringpart" - set tok "" - if {$had_slash} {append tok "\\"} - append tok "." - } - multiliteral-space { - set_tokenType "literalpart" - set tok "." - } - XXXdottedkey-space { - ### obs? - set_tokenType "dotsep" - set tok "." - return 1 - } - dottedkey-space-tail { - ### - set_tokenType "dotsep" - set tok "." - return 1 - } - default { - set_tokenType "untyped_value" - set tok "." - } - } - } - - } - " " { - if {[tcl::string::length $tokenType]} { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - #todo had_slash - emit token or error - #whitespace is a terminator for bare keys - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - comment { - if {$had_slash} { - append tok "\\" - } - append tok $c - } - string - dquotedkey { - if {$had_slash} { append tok "\\" } - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART xxx WS " " - incr i -1 - return 1 - } - } - literal - literalpart - squotedkey { - append tok $c - } - whitespace { - if {$state eq "multistring-space"} { - append tok $c - } else { - append tok $c - } - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace space during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. 
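                        #e.g [ fruit . "apple" ]  - the spaces around the dotted parts travel with the
                        #tablename token and are normalized later when the tomlish is interpreted as a datastructure.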
- set_tokenType tablename - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok "" - if {$had_slash} {append tok "\\"} - append tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType "whitespace" - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - if {$had_slash} { - error "tomlish unexpected backslash [tomlish::parse::report_line]" - } - set_tokenType "whitespace" - append tok $c - } - } - } - } - tab { - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out (?review) - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - barekey { - #whitespace is a terminator for bare keys - incr i -1 - #set_token_waiting type whitespace value $c complete 1 - return 1 - } - untyped_value { - #unquoted values (int,date,float etc) are terminated by whitespace - #set_token_waiting type whitespace value $c complete 1 - incr i -1 - return 1 - } - squotedkey { - append tok $c - } - dquotedkey - string - comment - whitespace { - #REVIEW - append tok $c - } - stringpart { - #for stringpart we store WS separately for ease of processing continuations (CONT stripping) - if {$had_slash} { - #REVIEW - #emit the stringpart - go back to the slash - incr i -2 - return 1 - } else { - #split into STRINGPART aaa WS " " - incr i -1 - return 1 - } - } - literal - literalpart { - append tok $c - } - starttablename - starttablearrayname { - incr i -1 - return 1 - } - tablename - tablearrayname { - #include whitespace in the tablename/tablearrayname - #Will need to be normalized upon interpreting the tomlish as a datastructure - append tok $c - } - default { - error "tomlish Received whitespace tab during tokenType '$tokenType' [tomlish::parse::report_line]" - } - } - } else { - set had_slash $slash_active - if {$slash_active} { - set slash_active 0 - } - switch -exact -- $state { - tablename-state { - #tablename can have leading,trailing and interspersed whitespace! - #These will not be treated as whitespace tokens, instead forming part of the name. - set_tokenType tablename - set tok $c - } - tablearrayname-state { - set_tokenType tablearrayname - set tok $c - } - multistring-space { - if {$had_slash} { - set_tokenType "continuation" - set tok "\\" - incr i -1 - return 1 - } else { - set_tokenType whitespace - append tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - default { - set_tokenType "whitespace" - append tok $c - } - } - } - } - bom { - #BOM (Byte Order Mark) - ignored by token consumer - if {[tcl::string::length $tokenType]} { - switch -exact -- $tokenType { - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - #assert - tok will be one or two squotes only - #A toml literal probably isn't allowed to contain this - #but we will parse and let the validator sort it out. 
- incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - literal - literalpart { - append tok $c - } - string - stringpart { - append tok $c - } - default { - #state machine will generally not have entry to accept bom - let it crash - set_token_waiting type bom value "\uFEFF" complete 1 startindex $cindex - return 1 - } - } - } else { - switch -exact -- $state { - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - multistring-space { - set_tokenType "stringpart" - set tok $c - } - default { - set_tokenType "bom" - set tok "\uFEFF" - return 1 - } - } - } - } - default { - - if {[tcl::string::length $tokenType]} { - if {$slash_active} {append tok "\\"} ;#if tokentype not appropriate for \, we would already have errored out. - set slash_active 0 - switch -exact -- $tokenType { - newline { - #incomplete newline - set_tokenType "cr" - incr i -1 - return 1 - } - tentative_accum_squote - tentative_accum_dquote { - incr i -1 - return 1 - } - _start_squote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_squote" - return 1 - } - _start_dquote_sequence { - incr i -[tcl::string::length $tok] - set_tokenType "single_dquote" - return 1 - } - whitespace { - if {$state eq "multistring-space"} { - incr i -1 - return 1 - } else { - #review - incr i -1 ;#We don't have a full token to add to the token_waiting dict - so leave this char for next run. - return 1 - } - } - barekey { - if {[tomlish::utils::is_barekey $c]} { - append tok $c - } else { - error "tomlish Unexpected character '$c' during bare key. Only \[a-zA-Z0-9_-\] and a selection of letter-like chars allowed. (see tomlish::utils::is_barekey) [tomlish::parse::report_line]" - } - } - starttablename - starttablearrayname { - incr i -1 - #allow statemachine to set context for subsequent chars - return 1 - } - stringpart { - append tok $c - } - default { - #e.g comment/string/literal/literalpart/untyped_value/starttablename/starttablearrayname/tablename/tablearrayname - append tok $c - } - } - } else { - set had_slash $slash_active - set slash_active 0 - switch -exact -- $state { - table-space - itable-space { - #if no currently active token - assume another key value pair - if {[tomlish::utils::is_barekey $c]} { - set_tokenType "barekey" - append tok $c - } else { - error "tomlish Unexpected char $c ([tomlish::utils::nonprintable_to_slashu $c]) whilst no active tokenType. [tomlish::parse::report_line]" - } - } - multistring-space { - set_tokenType "stringpart" - if {$had_slash} { - set tok \\$c - } else { - set tok $c - } - } - multiliteral-space { - set_tokenType "literalpart" - set tok $c - } - tablename-state { - set_tokenType "tablename" - set tok $c - } - tablearrayname-state { - set_tokenType "tablearrayname" - set tok $c - } - dottedkey-space { - set_tokenType barekey - set tok $c - } - default { - #todo - something like ansistring VIEW to show control chars? - set cshow [string map [list \t tab \v vt] $c] - tomlish::log::debug "- tokloop char '$cshow' setting to untyped_value while state:$state [tomlish::parse::report_line]" - set_tokenType "untyped_value" - set tok $c - } - } - } - } - } - - } - - #run out of characters (eof) - if {[tcl::string::length $tokenType]} { - #check for invalid ending tokens - #if {$state eq "err-state"} { - # error "Reached end of data whilst tokenType = '$tokenType'. 
INVALID" - #} - switch -exact -- $tokenType { - _start_squote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open literal - error "tomlish eof reached without closing single quote for string literal. [tomlish::parse::report_line]" - } - 2 { - set_tokenType "literal" - set tok "" - return 1 - - ##review - #set_token_waiting type endsquote value "'" complete 1 startindex [expr {$cindex -1}] - #set_tokenType "literal" - #set tok "" - #return 1 - } - } - } - _start_dquote_sequence { - set toklen [tcl::string::length $tok] - switch -- $toklen { - 1 { - #invalid eof with open string - error "tomlish eof reached without closing double quote for string. [tomlish::parse::report_line]" - } - 2 { - set_tokenType "string" - set tok "" - return 1 - } - } - } - newline { - #The only newline token that has still not been returned should have a tok value of "cr" - puts "tomlish eof reached - with incomplete newline token '$tok'" - if {$tok eq "cr"} { - #we convert lone cr to it's own "cr" token elsewhere in the document to allow statemachine to handle it. - #(which it should generally do by not handling it ie raising an error - or emitting an ERROR list in the tomlish) - #if trailing char is a lone cr - we should encode it the same way as elsewhere that is outside of values - # ie as it's own token. - switch_tokenType "cr" - return 1 - } else { - #should be unreachable - error "tomlish eof reached - with invalid newline token. value: $tok" - } - } - } - set_token_waiting type eof value eof complete 1 startindex $i ;#review - return 1 - } else { - ::tomlish::log::debug "- No current tokenType, ran out of characters, setting tokenType to 'eof' [tomlish::parse::report_line]" - set tokenType "eof" - set tok "eof" - } - return 0 - } - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::parse ---}] -} - -namespace eval tomlish::dict { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - - proc is_tomlish_typeval {d} { - #designed to detect {type value } e.g {type INT value 3}, {type STRING value "blah etc"} - #as a sanity check we need to avoid mistaking user data that happens to match same form - #consider x.y={type="spud",value="blah"} - #The value of type will itself have already been converted to {type STRING value spud} ie never a single element. - #check the length of the type as a quick way to see it's a tag - not something else masqerading. 
- expr {[dict size $d] == 2 && [dict exists $d type] && [dict exists $d value] && [llength [dict get $d type]] == 1} - } - proc is_tomlish_typeval2 {d} { - upvar ::tomlish::tags tags - expr {[lindex $d 0] eq "type" && [lindex $d 1] in $tags} - } - proc last_tomltype_posn {d} { - set last_simple -1 - set dictposn [expr {[dict size $d] -1}] - foreach k [lreverse [dict keys $d]] { - set dval [dict get $d $k] - if {[is_tomlish_typeval $dval]} { - set last_simple $dictposn - break - } - incr dictposn -1 - } - return $last_simple - } - - - #review - proc name_from_tablestack {tablestack} { - set name "" - foreach tinfo [lrange $tablestack 1 end] { - lassign $tinfo type namepart - switch -- $type { - T { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - I { - if {$name eq ""} { - append name $namepart - } else { - append name .$namepart - } - } - default { - #end at first break in the leading sequence of T & I tablenames - break - } - } - } - return $name - } - - proc _show_tablenames {tablenames_info} { - append msg \n "tablenames_info:" \n - dict for {tkey tinfo} $tablenames_info { - append msg " " "table: $tkey" \n - dict for {field finfo} $tinfo { - append msg " " "$field $finfo" \n - } - } - return $msg - } -} -tcl::namespace::eval tomlish::to_dict { - proc tablename_split {tablename {normalize false}} { - #we can't just split on . because we have to handle quoted segments which may contain a dot. - #eg {dog."tater.man"} - set sLen [tcl::string::length $tablename] - set segments [list] - set mode "preval" ;#5 modes: preval, quoted,litquoted, unquoted, postval - #quoted is for double-quotes, litquoted is for single-quotes (string literal) - set seg "" - for {set i 0} {$i < $sLen} {incr i} { - - if {$i > 0} { - set lastChar [tcl::string::index $tablename [expr {$i - 1}]] - } else { - set lastChar "" - } - - #todo - track\count backslashes properly - - set c [tcl::string::index $tablename $i] - if {$c eq "\""} { - if {($lastChar eq "\\")} { - #not strictly correct - we could have had an even number prior-backslash sequence - #the toml spec would have us error out immediately on bsl in bad location - but we're - #trying to parse to unvalidated tomlish - set ctest escq - } else { - set ctest dq - } - } else { - set ctest [string map [list " " sp \t tab] $c] - } - - switch -- $ctest { - . { - switch -exact -- $mode { - preval { - error "tablename_split. dot not allowed - expecting a value" - } - unquoted { - #dot marks end of segment. - if {![tomlish::utils::is_barekey $seg]} { - error "tablename_split. unquoted key segment $seg is not a valid toml key" - } - lappend segments $seg - set seg "" - set mode "preval" - } - quoted { - append seg $c - } - litquoted { - append seg $c - } - postval { - #got dot in an expected location - set mode "preval" - } - } - } - dq { - #unescaped dquote - switch -- $mode { - preval { - set mode "quoted" - set seg "\"" - } - unquoted { - #invalid in barekey - but we are after structure only - append seg $c - } - quoted { - append seg $c - if {$normalize} { - lappend segments [::tomlish::utils::unescape_string [tcl::string::range $seg 1 end-1]] - } else { - lappend segments $seg - } - set seg "" - set mode "postval" ;#make sure we only accept a dot or end-of-data now. - } - litquoted { - append seg $c - } - postval { - error "tablename_split. expected whitespace or dot, got double quote. 
tablename: '$tablename'" - } - } - } - ' { - switch -- $mode { - preval { - append seg $c - set mode "litquoted" - } - unquoted { - #single quote inside e.g o'neill - ultimately invalid - but we pass through here. - append seg $c - } - quoted { - append seg $c - } - litquoted { - append seg $c - #no normalization to do aside from stripping squotes - if {$normalize} { - lappend segments [tcl::string::range $seg 1 end-1] - } else { - lappend segments $seg - } - set seg "" - set mode "postval" - } - postval { - error "tablename_split. expected whitespace or dot, got single quote. tablename: '$tablename'" - } - } - } - sp - tab { - switch -- $mode { - preval - postval { - #ignore - } - unquoted { - #terminates a barekey - lappend segments $seg - set seg "" - set mode "postval" - } - default { - #append to quoted or litquoted - append seg $c - } - } - } - default { - switch -- $mode { - preval { - set mode unquoted - append seg $c - } - postval { - error "tablename_split. Expected a dot separator. got '$c'. tablename: '$tablename'" - } - default { - append seg $c - } - } - } - } - - if {$i == $sLen-1} { - #end of data - ::tomlish::log::debug "End of data: mode='$mode'" - switch -exact -- $mode { - preval { - error "tablename_split. Expected a value after last dot separator. tablename: '$tablename'" - } - unquoted { - if {![tomlish::utils::is_barekey $seg]} { - #e.g toml-test invalid/table/with-pound required to fail for invalid barekey - error "tablename_split. unquoted key segment $seg is not a valid toml key" - } - lappend segments $seg - } - quoted { - error "tablename_split. Expected a trailing double quote. tablename: '$tablename'" - } - litquoted { - error "tablename_split. Expected a trailing single quote. tablename: '$tablename'" - } - postval { - #ok - segment already lappended - } - } - } - } - - #note - we must allow 'empty' quoted strings '' & "" - # (these are 'discouraged' but valid toml keys) - - return $segments - } - - #tablenames (& tablearraynames) may contain irrelevant leading, trailing and interspersed whitespace - # tablenames can be made up of segments delimited by dots. .eg [ a.b . c ] - #trimmed, the tablename becomes {a.b.c} - # A segment may contain whitespace if it is quoted e.g [a . b . "c etc " ] - #ie whitespace is only irrelevant if it's outside a quoted segment - #trimmed, the tablename becomes {a.b."c etc "} - proc tablename_trim {tablename} { - set segments [tablename_split $tablename false] - set trimmed_segments [list] - foreach seg $segments { - lappend trimmed_segments [::string trim $seg " \t"] - } - return [join $trimmed_segments .] - } - - proc get_dottedkey_info {dottedkeyrecord} { - set key_hierarchy [list] - set key_hierarchy_raw [list] - if {[lindex $dottedkeyrecord 0] ne "DOTTEDKEY"} { - error "tomlish::to_dict::get_dottedkey_info error. Supplied list doesn't appear to be a DOTTEDKEY (tag: [lindex $dottedkeyrecord 0])" - } - set compoundkeylist [lindex $dottedkeyrecord 1] - set expect_sep 0 - foreach part $compoundkeylist { - set parttag [lindex $part 0] - if {$parttag eq "WS"} { - continue - } - if {$expect_sep} { - if {$parttag ne "DOTSEP"} { - error "DOTTEDKEY missing dot separator between parts. 
'$dottedkeyrecord'" - } - set expect_sep 0 - } else { - set val [lindex $part 1] - switch -exact -- $parttag { - KEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw $val - } - DQKEY { - lappend key_hierarchy [::tomlish::utils::unescape_string $val] - lappend key_hierarchy_raw \"$val\" - } - SQKEY { - lappend key_hierarchy $val - lappend key_hierarchy_raw "'$val'" - } - default { - error "tomlish::to_dict::get_dottedkey_info DOTTED key unexpected part '$parttag' - ensure dot separator is between key parts. '$item'" - } - } - set expect_sep 1 - } - } - return [dict create keys $key_hierarchy keys_raw $key_hierarchy_raw] - } - -} - - -tcl::namespace::eval tomlish::app { - #*** !doctools - #[subsection {Namespace tomlish::app}] - #[para] - #[list_begin definitions] - - tcl::namespace::eval argdoc { - proc test_suites {} { - if {[package provide test::tomlish] eq ""} { - return [list] - } - return [test::tomlish::SUITES] - } - } - - package require punk::args - punk::args::define { - @id -id ::tomlish::app::decoder - @cmd -name tomlish::app::decoder -help\ - "Read toml on stdin until EOF - on error - returns non-zero exit code and writes error to - the errorchannel. - on success - returns zero exit code and writes JSON encoding - of the data to the outputchannel. - This decoder is intended to be compatble with toml-test." - @leaders -min 0 -max 0 - @opts - -help -type none -help\ - "Display this usage message" - -inputchannel -default stdin - -outputchannel -default stdout - -errorchannel -default stderr - @values -min 0 -max 0 - } - proc decoder {args} { - set argd [punk::args::parse $args withid ::tomlish::app::decoder] - set ch_input [dict get $argd opts -inputchannel] - set ch_output [dict get $argd opts -outputchannel] - set ch_error [dict get $argd opts -errorchannel] - if {[dict exists $argd received -help]} { - return [punk::args::usage -scheme info ::tomlish::app::decoder] - } - - #fconfigure stdin -encoding utf-8 - fconfigure $ch_input -translation binary - #Just slurp it all - presumably we are not handling massive amounts of data on stdin. - # - even if the input is large, we probably don't gain much (aside from possible memory savings?) by attempting to process input as it arrives. - if {[catch { - set toml [read $ch_input] - }]} { - exit 2 ;#read error - } - try { - set j [::tomlish::toml_to_json $toml] - } on error {em} { - puts $ch_error "decoding failed: '$em'" - exit 1 - } - puts -nonewline $ch_output $j - exit 0 - } - - package require punk::args - punk::args::define { - @id -id ::tomlish::app::encoder - @cmd -name tomlish::app::encoder -help\ - "Read JSON on input until EOF - return non-zero exitcode if JSON data cannot be converted to - a valid TOML representation. - return zero exitcode and TOML data on output if JSON data can - be converted. - This encoder is intended to be compatible with toml-test." 
- @leaders -min 0 -max 0 - @opts - -help -type none -help \ - "Display this usage message" - -inputchannel -default stdin - -outputchannel -default stdout - -errorchannel -default stderr - @values -min 0 -max 0 - } - proc encoder {args} { - set argd [punk::args::parse $args withid ::tomlish::app::encoder] - set ch_input [dict get $argd opts -inputchannel] - set ch_output [dict get $argd opts -outputchannel] - set ch_error [dict get $argd opts -errorchannel] - if {[dict exists $argd received -help]} { - return [punk::args::usage -scheme info ::tomlish::app::encoder] - } - #review - fconfigure $ch_input -translation binary - if {[catch { - set json [read $ch_input] - }]} { - exit 2 ;#read error - } - try { - set toml [::tomlish::json_to_toml $json] - } on error {em} { - puts $ch_error "encoding failed: '$em'" - exit 1 - } - puts -nonewline $ch_output $toml - exit 0 - } - - punk::args::define { - @dynamic - @id -id ::tomlish::app::test - @cmd -name tomlish::app::test - @leaders - @opts -any 1 - -help -type none -help\ - "Display this usage message - or further info if more args." - -suite -default tests -choices {${[::tomlish::app::argdoc::test_suites]}} - @values -min 0 -max -1 - } - proc test {args} { - package require test::tomlish - set argd [punk::args::parse $args withid ::tomlish::app::test] - set opts [dict get $argd opts] - set values [dict get $argd values] - set received [dict get $argd received] - set solos [dict get $argd solos] - set opt_suite [dict get $opts -suite] - if {[dict exists $received -help] && ![dict exists $received -suite]} { - return [punk::args::usage -scheme info ::tomlish::app::test] - } - - test::tomlish::SUITE $opt_suite - #if {[catch {test::tomlish::SUITE $opt_suite} errM]} { - # puts stderr "Unknown test suite '$opt_suite'. Available suites: [test::tomlish::SUITES]" - # exit 1 - #} - set run_opts [dict remove $opts -suite] - set run_opts [dict remove $run_opts {*}$solos] - set result [test::tomlish::RUN {*}$run_opts {*}$solos {*}$values] - return $result - } - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::app ---}] -} - -proc ::tomlish::appnames {} { - set applist [list] - foreach cmd [info commands ::tomlish::app::*] { - lappend applist [namespace tail $cmd] - } - return $applist -} - - -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -# Secondary API namespace -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ -namespace eval tomlish::lib { - namespace export {[a-z]*}; # Convention: export all lowercase - namespace path [namespace parent] - #*** !doctools - #[subsection {Namespace tomlish::lib}] - #[para] Secondary functions that are part of the API - #[list_begin definitions] - - #proc utility1 {p1 args} { - # #*** !doctools - # #[call lib::[fun utility1] [arg p1] [opt {?option value...?}]] - # #[para]Description of utility1 - # return 1 - #} - - - - #*** !doctools - #[list_end] [comment {--- end definitions namespace tomlish::lib ---}] -} -# ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ - -if {[info exists ::argc] && $::argc > 0} { - #puts stderr "argc: $::argc args: $::argv" - set arglist $::argv - # -------------- - #make sure any dependant packages that are sourced don't get any commandline args - set ::argv {} - set ::argc 0 - # -------------- - package require punk::args - punk::args::define { - @dynamic - @id -id tomlish::cmdline - @cmd -name tomlish -help\ - "toml encoder/decoder written in Tcl" - @opts -any 1 - -help -type none -help\ - "Display this usage message or more specific - help if further arguments provided." 
- -app -choices {${[tomlish::appnames]}} - } - try { - set argd [punk::args::parse $arglist withid tomlish::cmdline] - } trap {PUNKARGS VALIDATION} {msg erroropts} { - puts stderr $msg - exit 1 - } - - - lassign [dict values $argd] leaders opts values received solos - if {[dict exists $received -help] && ![dict exists $received -app]} { - #only emit cmdline help if -app not supplied as well - otherwise app function can act on -help for more specific help - #puts stdout "Usage: -app where appname one of:[tomlish::appnames]" - puts stdout [punk::args::usage -scheme info tomlish::cmdline] - exit 0 - } - if {![dict exists $received -app]} { - puts stderr [punk::args::usage -scheme error tomlish::cmdline] - exit 1 - } - - set app [dict get $opts -app] - set appnames [tomlish::appnames] - set app_opts [dict remove $opts -app {*}$solos] - try { - set result [tomlish::app::$app {*}$app_opts {*}$solos {*}$values] - } trap {PUNKARGS VALIDATION} {msg erroropts} { - #The validation error should fully describe the issue - #no need for errortrace - keep the output cleaner - puts stderr $msg - exit 1 - } trap {} {msg erroropts} { - #unexpected error - uncaught throw will produce error trace - #todo - a support msg? Otherwise we may as well just leave off this trap. - throw [dict get $erroropts -errorcode] [dict get $erroropts -errorinfo] - } - if {"-help" in $solos} { - puts stderr $result - exit 1 - } else { - if {$result ne ""} { - puts stdout $result - exit 0 - } - } - - #set opts [dict create] - #set opts [dict merge $opts $::argv] - - #set opts_understood [list -app ] - #if {"-app" in [dict keys $opts]} { - # #Don't vet the remaining opts - as they are interpreted by each app - #} else { - # foreach key [dict keys $opts] { - # if {$key ni $opts_understood} { - # puts stderr "Option '$key' not understood" - # exit 1 - # } - # } - #} - #if {[dict exists $opts -app]} { - # set app [dict get $opts -app] - # set appnames [tomlish::appnames] - # if {$app ni $appnames} { - # puts stderr "app '[dict get $opts -app]' not found. Available apps: $appnames" - # exit 1 - # } - # tomlish::app::$app {*}$opts - #} -} - -## Ready -package provide tomlish [namespace eval tomlish { - variable pkg tomlish - variable version - set version 1.1.5 -}] -return - -#*** !doctools -#[manpage_end] - diff --git a/src/vfs/_vfscommon.vfs/modules/punk/ansi-0.1.1.tm b/src/vfs/_vfscommon.vfs/modules/punk/ansi-0.1.1.tm index 6b04827d..3c20391f 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/ansi-0.1.1.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/ansi-0.1.1.tm @@ -6439,6 +6439,8 @@ tcl::namespace::eval punk::ansi::class { set o_gx0states [list] set o_splitindex [list] + #sha1 takes *much* longer to compute than md5 if tcllibc not available - otherwise it is generally faster + #we should fall back to md5 if no acceleration available. check for command sha1::sha1c ? set o_cksum_command [list sha1::sha1 -hex] diff --git a/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm b/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm index 7710fa00..ab1ca020 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm @@ -876,7 +876,7 @@ tcl::namespace::eval punk::args { set cache_key $args #ideally we would use a fast hash algorithm to produce a short key with low collision probability. - #something like md5 would be ok (this is non cryptographic) - but md5 uses open and isn't usable by default in a safe interp. 
(sha1 often faster on modern cpus) + #something like md5 would be ok (this is non cryptographic) - but md5 uses open and isn't usable by default in a safe interp. (sha1 often faster on modern cpus but terribly slow without an accelerator) #review - check if there is a built-into-tcl way to do this quickly #for now we will just key using the whole string #performance seems ok - memory usage probably not ideal diff --git a/src/vfs/_vfscommon.vfs/modules/punk/libunknown-0.1.tm b/src/vfs/_vfscommon.vfs/modules/punk/libunknown-0.1.tm index 88b91288..a4f56010 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/libunknown-0.1.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/libunknown-0.1.tm @@ -847,22 +847,24 @@ tcl::namespace::eval punk::libunknown { #keys are in reverse order due to tclPkgUnknown processing order set ordered_searchpaths [lreverse [dict keys $dict_added]];# ordered as in auto_path + #JN todo - look at tm epochs dict for {pkg versiond} $refresh_dict { set versions [dict keys $versiond] + #puts stderr "---->pkg:$pkg versions: $versions" foreach searchpath $ordered_searchpaths { set addedinfo [dict get $dict_added $searchpath] set vidx -1 foreach v $versions { incr vidx if {[dict exists $addedinfo $pkg $v]} { - ledit versions $vidx $vidx + ledit versions $vidx $vidx ;incr vidx -1 ;#maintain vidx as index into current state of $versions - not original state the foreach operates across. set iscript [dict get $addedinfo $pkg $v scr] #todo - find the iscript in the '$epoch pkg epochs added paths' lists and determine os vs dev vs internal #(scanning for path directly in the ifneeded script for pkgs is potentially error prone) #for .tm ifneeded scripts - the syntax is simple enough to determine directly (and ifneeded scr not stored for those anyway) set justaddedscript [package ifneeded $pkg $v] if {$justaddedscript ne $iscript} { - puts "---->refreshing $pkg $v - reverting to already stored from path:$searchpath versions: $versions" + #puts "---->refreshing $pkg $v - reverting to already stored from path:$searchpath versions: $versions" package ifneeded $pkg $v $iscript #dict set pkgvdone $pkg $v 1 } diff --git a/src/vfs/_vfscommon.vfs/modules/punk/mix/base-0.1.tm b/src/vfs/_vfscommon.vfs/modules/punk/mix/base-0.1.tm index a4bc3c70..ea9fc85f 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/mix/base-0.1.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/mix/base-0.1.tm @@ -430,7 +430,16 @@ namespace eval punk::mix::base { #not just used by cksum_path. 
used by caller (e.g fill_relativecksums_from_base_and_relativepathdict via cksum_filter_opts) to determine what opt names passed through variable cksum_default_opts - set cksum_default_opts [dict create -cksum_content 1 -cksum_meta auto -cksum_acls 0 -cksum_usetar auto -cksum_algorithm sha1] + set default_hash sha1 ;#but fall back to md5 if either sha1 is unavailable or unaccelerated (pure tcl sha1 is way slower than pure tcl md5 - can take minutes on even moderate sized source files) + if {![catch {package require sha1}]} { + set impls [::sha1::Implementations] + if {[llength $impls] == 1 && [string tolower [lindex $impls 0]] eq "tcl"} { + set default_hash md5 + } + } else { + set default_hash md5 + } + set cksum_default_opts [dict create -cksum_content 1 -cksum_meta auto -cksum_acls 0 -cksum_usetar auto -cksum_algorithm $default_hash] proc cksum_default_opts {} { variable cksum_default_opts return $cksum_default_opts @@ -438,9 +447,9 @@ namespace eval punk::mix::base { #crc::cksum is extremely slow in tcllib as at 2023 e.g 20x slower (no c implementation?) # - try builtin zlib crc instead? - #sha1 is performant - and this is not being used in a cryptographic or adversarial context - so performance and practical unlikelihood of accidental collisions should be the main consideration. + #sha1 is performant (when accelerator present) - and this is not being used in a cryptographic or adversarial context - so performance and practical unlikelihood of accidental collisions should be the main consideration. #adler32 is fastest for some larger files of a few MB but slower on small files (possibly due to Tcl-based file load?) - #sha1 as at 2023 seems a reasonable default + #sha1 as at 2023 seems a reasonable default - (but only if accelerator present) proc cksum_algorithms {} { variable sha3_implementation #sha2 is an alias for sha256 diff --git a/src/vfs/_vfscommon.vfs/modules/punkcheck-0.1.0.tm b/src/vfs/_vfscommon.vfs/modules/punkcheck-0.1.0.tm index 50bcc2f8..199d06bf 100644 --- a/src/vfs/_vfscommon.vfs/modules/punkcheck-0.1.0.tm +++ b/src/vfs/_vfscommon.vfs/modules/punkcheck-0.1.0.tm @@ -340,7 +340,9 @@ namespace eval punkcheck { set ts_now [clock microseconds] set metadata_us [expr {$ts_now - $ts_start}] - dict set installing_record -metadata_us $metadata_us + #?? + #JJJ + #dict set installing_record -metadata_us $metadata_us dict set installing_record -ts_start_transfer $ts_now lset fileinfo_body end $installing_record @@ -473,6 +475,7 @@ namespace eval punkcheck { set path_cksum_caches [list] } set o_fileset_record [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $rel_source_path $o_fileset_record $path_cksum_caches] + #JJJ - update -metadata_us here? } method targetset_last_complete {} { @@ -1599,6 +1602,7 @@ namespace eval punkcheck { #puts stdout "Current target dir: $current_target_dir" + set last_processed_dir "" foreach m $match_list { set new_tgt_cksum_info [list] set relative_target_path [file join $relative_target_dir $m] @@ -1617,6 +1621,13 @@ namespace eval punkcheck { continue } #puts stdout " checking file : $current_source_dir/$m" + set thismatchdir [file dirname $m] + if {$last_processed_dir ne $thismatchdir} { + puts stdout "\n checking files in [file join $current_source_dir $thismatchdir]" + set last_processed_dir $thismatchdir + } else { + puts -nonewline stdout . 
+ }
 set ts_start [clock microseconds]
 set seconds [expr {$ts_start / 1000000}]
 set ts_start_iso [clock format $seconds -format "%Y-%m-%dT%H:%M:%S"]
@@ -1657,7 +1668,25 @@ namespace eval punkcheck {
 #Note this isn't a recordlist function - so it doesn't purely operate on the records
 #this hits the filesystem for the sourcepath - gets checksums/timestamps depending on config.
 #It doesn't save to .punkcheck (the only punkcheck::installfile_ method which doesn't)
+ set ts1 [clock milliseconds]
 set filerec [punkcheck::installfile_add_source_and_fetch_metadata $punkcheck_folder $relative_source_path $filerec]
+ set ts2 [clock milliseconds]
+ set diff [expr {$ts2 - $ts1}]
+ if {$diff > 100} {
+ set errprefix ">>> punkcheck:"
+ puts stderr "$errprefix performance warning: fetch_metadata for $m took $diff ms."
+ set lb [lindex [dict get $filerec body] end]
+ #puts stderr "$errprefix filerec last body record:$lb"
+ set records [dict get $lb body]
+ set lr [lindex $records end]
+ set alg [dict get $lr -cksum_all_opts -cksum_algorithm]
+ if {$alg eq "sha1"} {
+ puts "$errprefix cksum_algorithm: sha1 (accelerators: [::sha1::Implementations])"
+ puts "$errprefix sha1 from: [package ifneeded sha1 [package present sha1]]"
+ } else {
+ puts "$errprefix cksum_algorithm: $alg"
+ }
+ }
@@ -1681,6 +1710,7 @@ namespace eval punkcheck {
 lappend files_copied $current_source_dir/$m
 } else {
 if {![file exists $current_target_dir/$m]} {
+ puts stderr "punkcheck: first copy to $current_target_dir/$m "
 file mkdir $current_target_dir
 file copy $current_source_dir/$m $current_target_dir
 set new_tgt_cksum_info [punk::mix::base::lib::cksum_path $current_target_dir/$m]
@@ -1691,9 +1721,15 @@ namespace eval punkcheck {
 installedsourcechanged-targets {
 if {[llength $changed]} {
 #An unrecorded installation is considered a source change (from unknown/unrecorded source to recorded)
+ puts -nonewline stderr "punkcheck: about to: file copy -force $current_source_dir/$m $current_target_dir"
+ set ts1 [clock milliseconds]
 file mkdir $current_target_dir
 file copy -force $current_source_dir/$m $current_target_dir
+ set ts2 [clock milliseconds]
+ puts -nonewline stderr " (copy time [expr {$ts2 - $ts1}] ms)"
 set new_tgt_cksum_info [punk::mix::base::lib::cksum_path $current_target_dir/$m]
+ set ts3 [clock milliseconds]
+ puts stderr " (cksum time [expr {$ts3 - $ts2}] ms)"
 lappend files_copied $current_source_dir/$m
 } else {
 set is_skip 1
diff --git a/src/vfs/_vfscommon.vfs/modules/textblock-0.1.3.tm b/src/vfs/_vfscommon.vfs/modules/textblock-0.1.3.tm
index d9858980..c89b3594 100644
--- a/src/vfs/_vfscommon.vfs/modules/textblock-0.1.3.tm
+++ b/src/vfs/_vfscommon.vfs/modules/textblock-0.1.3.tm
@@ -95,6 +95,7 @@ tcl::namespace::eval textblock {
 #NOTE sha1, although computationally more intensive, tends to be faster than md5 on modern cpus
 #(more likely to be optimised for modern cpu features?)
 #(This speed improvement may not apply for short strings)
+ #This is probably only true if tcllibc is available - pure-tcl sha1 is excruciatingly slow...
 variable use_hash ;#framecache
 set use_hash none ;#slightly faster but uglier layout for viewing frame_cache display
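
For reference, the sha1/md5 selection that this changeset adds to punk::mix::base (and that the comments in punk::ansi, punk::args and textblock allude to) can be factored into a small helper. The sketch below is illustrative only and is not part of the patch; pick_cksum_algorithm and cksum_file are hypothetical names, and it assumes tcllib's sha1 and md5 packages together with the same ::sha1::Implementations introspection used in the hunks above.

    package require md5

    # Prefer sha1 only when an accelerated implementation (tcllibc/critcl, Trf, cryptkit)
    # is loaded; pure-Tcl sha1 is far slower than pure-Tcl md5 on large files.
    proc pick_cksum_algorithm {} {
        if {![catch {package require sha1}]} {
            set impls [::sha1::Implementations]
            #if the only usable implementation is the pure-Tcl one - avoid sha1
            if {!([llength $impls] == 1 && [string tolower [lindex $impls 0]] eq "tcl")} {
                return sha1
            }
        }
        return md5
    }

    # usage sketch: checksum a file with whichever algorithm was selected
    proc cksum_file {path} {
        switch -exact -- [pick_cksum_algorithm] {
            sha1    {return [::sha1::sha1 -hex -file $path]}
            default {return [::md5::md5 -hex -file $path]}
        }
    }

The punkcheck performance warning above reflects the same concern from the other direction: when fetch_metadata is slow it prints ::sha1::Implementations and the package ifneeded script, so an unaccelerated sha1 can be identified as the cause.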