From 2dd24a7ba9bb2a5f3bee47ff896e9f9cb4aefcba Mon Sep 17 00:00:00 2001 From: Julian Noble Date: Wed, 13 Aug 2025 04:10:17 +1000 Subject: [PATCH] punk::args add -defaultdisplaytype dict|list; punk::pdf improvements --- src/bootsupport/modules/flagfilter-0.3.tm | 5 +- src/bootsupport/modules/punk-0.1.tm | 16 +- src/bootsupport/modules/punk/ansi-0.1.1.tm | 237 +++++- src/bootsupport/modules/punk/args-0.2.tm | 39 +- src/bootsupport/modules/punk/char-0.1.0.tm | 101 +++ src/bootsupport/modules/punk/console-0.1.1.tm | 1 + src/bootsupport/modules/punk/lib-0.1.2.tm | 126 ++- src/bootsupport/modules/punk/ns-0.1.0.tm | 16 +- src/bootsupport/modules/shellfilter-0.2.tm | 40 +- src/bootsupport/modules/textblock-0.1.3.tm | 28 +- src/modules/punk/args-999999.0a1.0.tm | 39 +- src/modules/punk/pdf-999999.0a1.0.tm | 799 ++++++++++++++---- .../custom/_project/punk.basic/src/make.tcl | 2 +- .../src/bootsupport/modules/flagfilter-0.3.tm | 5 +- .../src/bootsupport/modules/punk-0.1.tm | 16 +- .../bootsupport/modules/punk/ansi-0.1.1.tm | 237 +++++- .../src/bootsupport/modules/punk/args-0.2.tm | 39 +- .../bootsupport/modules/punk/char-0.1.0.tm | 101 +++ .../bootsupport/modules/punk/console-0.1.1.tm | 1 + .../src/bootsupport/modules/punk/lib-0.1.2.tm | 126 ++- .../src/bootsupport/modules/punk/ns-0.1.0.tm | 16 +- .../bootsupport/modules/shellfilter-0.2.tm | 40 +- .../bootsupport/modules/textblock-0.1.3.tm | 28 +- .../_project/punk.project-0.1/src/make.tcl | 2 +- .../src/bootsupport/modules/flagfilter-0.3.tm | 5 +- .../src/bootsupport/modules/punk-0.1.tm | 16 +- .../bootsupport/modules/punk/ansi-0.1.1.tm | 237 +++++- .../src/bootsupport/modules/punk/args-0.2.tm | 39 +- .../bootsupport/modules/punk/char-0.1.0.tm | 101 +++ .../bootsupport/modules/punk/console-0.1.1.tm | 1 + .../src/bootsupport/modules/punk/lib-0.1.2.tm | 126 ++- .../src/bootsupport/modules/punk/ns-0.1.0.tm | 16 +- .../bootsupport/modules/shellfilter-0.2.tm | 40 +- .../bootsupport/modules/textblock-0.1.3.tm | 28 +- .../_project/punk.shell-0.1/src/make.tcl | 2 +- .../_vfscommon.vfs/modules/punk/args-0.2.tm | 39 +- .../_vfscommon.vfs/modules/punk/pdf-0.1.0.tm | 799 ++++++++++++++---- 37 files changed, 2949 insertions(+), 560 deletions(-) diff --git a/src/bootsupport/modules/flagfilter-0.3.tm b/src/bootsupport/modules/flagfilter-0.3.tm index 1d37e215..00f58e82 100644 --- a/src/bootsupport/modules/flagfilter-0.3.tm +++ b/src/bootsupport/modules/flagfilter-0.3.tm @@ -1538,6 +1538,7 @@ namespace eval flagfilter { } } + #todo - rename 'cprocessor' is misleading oo::class create cprocessor { variable o_runid variable o_name @@ -1577,7 +1578,9 @@ namespace eval flagfilter { if {[dict exists $o_pinfo match]} { set o_matchspec [dict get $o_pinfo match] } else { - set o_matchspec {^[^-^/].*} ;#match anything that isn't flaglike + #review - unix paths? conflict with windows style flag such as /w + #must accept empty string + set o_matchspec {^[^-^/].*|^$} ;#match anything that isn't flaglike } set o_found_match 0 set o_matched_argument "" ;#need o_found_match to differentiate match of empty string diff --git a/src/bootsupport/modules/punk-0.1.tm b/src/bootsupport/modules/punk-0.1.tm index 6fb185a9..83dad2bf 100644 --- a/src/bootsupport/modules/punk-0.1.tm +++ b/src/bootsupport/modules/punk-0.1.tm @@ -6947,7 +6947,8 @@ namespace eval punk { set newrow {} foreach oldrow $list_rows { if {$j >= [llength $oldrow]} { - continue + #continue + lappend newrow "" } else { lappend newrow [lindex $oldrow $j] } @@ -6956,6 +6957,19 @@ namespace eval punk { } return $res } + proc transpose_equal_lists {list_rows} { + set columns [list] + set rowidx -1 + foreach l $list_rows { + set colidx -1 + incr rowidx + foreach val $l { + incr colidx + lset columns $colidx $rowidx $val + } + } + return $columns + } proc transpose_strings {list_of_strings} { set charlists [lmap v $list_of_strings {split $v ""}] set tchars [transpose_lists $charlists] diff --git a/src/bootsupport/modules/punk/ansi-0.1.1.tm b/src/bootsupport/modules/punk/ansi-0.1.1.tm index 255715ad..ad2d58f4 100644 --- a/src/bootsupport/modules/punk/ansi-0.1.1.tm +++ b/src/bootsupport/modules/punk/ansi-0.1.1.tm @@ -3785,7 +3785,7 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu } set codestack [list] - if {[punk::ansi::ta::detect $text]} { + if {[punk::ansi::ta::detectcode $text]} { set emit "" #set parts [punk::ansi::ta::split_codes $text] set parts [punk::ansi::ta::split_codes_single $text] @@ -3892,7 +3892,7 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu } set codestack [list] - if {[punk::ansi::ta::detect $text]} { + if {[punk::ansi::ta::detectcode $text]} { set emit "" #set parts [punk::ansi::ta::split_codes $text] set parts [punk::ansi::ta::split_codes_single $text] @@ -4158,7 +4158,8 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu # where line and column are ascii codes whose values are +31 # vt52 can be entered/exited via escapes # This means we probably need to to wrap enter/exit vt52 and keep this state - as we don't have a standard way to query for terminal type - # (vt52 supports ESC Z - but vt100 sometimes? doesn't - and querying at each output would be slow anyway, even if there was a common query :/ ) + # (vt52 supports ESC Z (obs DECID) - but vt100 sometimes? doesn't - and querying at each output would be slow anyway, even if there was a common query :/ ) + #ESC\[c - is more modern equiv of DECID lappend PUNKARGS [list { @id -id ::punk::ansi::vt52move @@ -4946,6 +4947,8 @@ to 223 (=255 - 32) } if {[string length $text] < 2} {return $text} set parts [punk::ansi::ta::split_codes $text] + #review - if we have only one element of a paired codeset such as PM,SOS - it will not be found by split_codes + #The output technically then still contains ansi (which may for example be hidden by terminal despite lack of closing ST) if {[llength $parts] == 1} {return [lindex $parts 0]} set out "" #todo - try: join [lsearch -stride 2 -index 0 -subindices -all -inline $parts *] "" @@ -5686,21 +5689,106 @@ tcl::namespace::eval punk::ansi { #[list_end] [comment {--- end definitions namespace punk::ansi::codetype ---}] } tcl::namespace::eval sequence_type { - proc is_Fe {code} { + #first byte after ESC identifies code type + #NOTE - we are looking for valid start of a single sequence here + #- not whether it is complete or where it ends, unless it's a fixed number of bytes + + #\u0020-\u002F + # ESC !"#$%&'()*+,-./ + + #\u0030-\u003F + #ESC 0-9:;<=>? + + #\u0040-\u005F + # ESC @A-Z[\]^ + + #\u0060-\u007E + + proc is_Fe7 {code} { # C1 control codes - if {[regexp {^\033\[[\u0040-\u005F]}]} { - #7bit - typical case - return 1 - } + #7bit - typical case + # ESC @A-Z[\]^ + return [regexp {^\033[\u0040-\u005F]} $code] + } + proc is_Fe {code} { + #although Fe7 more common - we'll put the simpler regex for 8 first + return [expr {[is_Fe8 $code] || [is_Fe7 $code]}] + } + proc is_Fe8 {code} { #8bit - #review - all C1 escapes ? 0x80-0x90F + #review - all C1 escapes ? 0x80-0x9F #This is possibly problematic as it is affected by encoding. #According to https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit #"However, in character encodings used on modern devices such as UTF-8 or CP-1252, those codes are often used for other purposes, so only the 2-byte sequence is typically used." - return 0 + return [regexp {^[\u0080-\u09F]} $code] + } + #ESC 0-9,:,;,<,=,>,? + proc is_Fp {code} { + #single byte following ESC + return [regexp {^\033[\u0030-\u003F]$} $code] } + + #https://en.wikipedia.org/wiki/ISO/IEC_2022 + #e.g + # ESC a (INT) interrupts the current process + # ESC c (RIS) reset terminal to initial state + #ESC `a-z{|}~ proc is_Fs {code} { - puts stderr "is_Fs unimplemented" + #single byte following ESC + return [regexp {^\033[\u0060-\u007E]$} $code] + } + + + proc is_nF {code} { + #2 bytes + #subcategorised by the low two bits of the first byte (n) + #further by whether the final byte is in \u0030-u003f (p) or not (t) + return [regexp {^\033[\u0020-\u002F]+[\u0030-\u007E]$} $code] + } + + #review - test + #3Fp - private use + #e.g vt100 + # ESC#3 DECDHL double-height letters top half + # ESC#4 DECDHL double-height letters bottom half + # ESC#5 DECSWL single-width line + # ESC#6 DECDWL double-width line + proc is_3Fp {code} { + return [regexp {^\033#[\u0020-\u002F]*[\u0030-\u003F]$} $code] ;#check regexp + } + + proc is_code7 {code} { + #Fe | Fs | Fp | nF | Fe + return [regexp {^\033[\u0040-\u005F]|^\033[\u0060-\u007e]$|^\033[\u0030-\u003F]$|^\033[\u0020-\u002F]+[\u0030-\u007E]$} $code] + } + proc is_code8 {code} { + return [regexp {^[\u0080-\u09F]} $code] + } + proc is_code {code} { + return [expr {[is_code8 $code] || [is_code7 $code]}] + } + + proc classify {code} { + return [switch -regexp -- $code { + {^\033[\u0030-\u003F]$} {string cat Fp} + {^[\u0080-\u009F]|^\033[\u0040-\u005F]} {string cat Fe} + {^\033[\u0060-\u007E]$} {string cat Fs} + {^\033[\u0020-\u002F]+[\u0030-\u007E]$} { + #nF sequences + set firstbytenum [scan [string index $code 1] %c] + set lastbyte [string index $code end] + + set n [expr {$firstbytenum & 3}] + if {[regexp {[\u0030-\u003F]} $lastbyte]} { + set tp p + } else { + set tp t + } + string cat ${n}F$tp + } + {^\033#[\u0020-\u002F]*[\u0030-\u003F]$} {string cat 3Fp} + default {string cat unknown} + }] } } # -- --- --- --- --- --- --- --- --- --- --- @@ -5718,6 +5806,7 @@ tcl::namespace::eval punk::ansi::ta { #[para] https://github.com/perlancar/perl-Text-ANSI-Util/blob/master/lib/Text/ANSI/BaseUtil.pm #[list_begin definitions] tcl::namespace::path ::punk::ansi + namespace export detect detect_in_list detect_sgr extract length split_codes split_at_codes split_codes_single variable PUNKARGS @@ -5748,20 +5837,27 @@ tcl::namespace::eval punk::ansi::ta { variable re_osc_open {(?:\x1b\]|\u009d).*} + #review - distinguishing standalone codes vs those that are paired with contents considered part of the code + #e.g PM,SOS are 'paired' ended by ST + #variable standalone_code_map [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> "" \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] variable re_standalones {(?:\x1bc|\x1b7|\x1b8|\x1bM|\x1bD|\x1bE|\x1bH|\x1b=|\x1b>|\x1b#3|\x1b#4|\x1b#5|\x1b#6|\x1b#8)} variable re_standalones_vt52 {(?:\x1bZ)} - #ESC Y move, ESC b foreground colour + # -- + #ESC Y move - \x1bY ie 2 bytes to close + #ESC b foreground colour - \x1bb 1 byte to close + variable re_vt52_incomplete {(?:\x1bY(.){0,1}$|\x1bb$)} + #\x1bc vt52 bgcolour conflict ? #ESC F - gr-on ESC G - gr-off - variable re_vt52_open {(?:\x1bY|\x1bb|\x1bF)} - #\x1bc vt52 bgcolour conflict ?? + # -- #if we don't split on altgraphics too and separate them out - it's easy to get into a horrible mess variable re_g0_group {(?:\x1b\(0)(?:(?!\x1b\(B).)*\x1b\(B} variable re_g0_open {(?:\x1b\(0)} variable re_g0_close {(?:\x1b\(B)} + #detect start of ansicode that is closed by ST # DCS "ESC P" or "0x90" is also terminated by ST set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} #ST terminators [list \007 \033\\ \u009c] @@ -5777,12 +5873,15 @@ tcl::namespace::eval punk::ansi::ta { variable re_ST {(?:\x1bP|\x1bX|\x1b\^|\x1b_)(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007)|(?:\u0090|\u0098|\u009E|\u009F)(?:(?!\u009c).)*(?:\u009c)} + #variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_standalones}|${re_standalones_vt52}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" + #variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" + variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_ST_open}|${re_vt52_incomplete}" #consider standalones as self-opening/self-closing - therefore included in both ansi_detect and ansi_detect_open #default for regexes is non-newline-sensitive matching - ie matches can span lines # -- --- --- --- - variable re_ansi_detect1 "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" + #variable re_ansi_detect1 "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" # -- --- --- --- #handrafted TRIE version of above. Somewhat difficult to construct and maintain. TODO - find a regexp TRIE generator that works with Tcl regexes #This does make things quicker - but it's too early to finalise the detect/split regexes (e.g missing \U0090 ) - will need to be redone. @@ -5802,20 +5901,24 @@ tcl::namespace::eval punk::ansi::ta { # plus more for auxiliary keypad codes in keypad application mode (and some in numeric mode) #regexp expanded syntax = ?x + #full detect - checking for closing sequences variable re_ansi_detect {(?x) - (?:\x1b(?:\[(?:[\x20-\x2f\x30-\x3f]*[\x40-\x7e])|c|7|8|M|D|E|H|=|>|<|A|B|C|I|J|K|L|M|Z|(?:Y(?:..))|(?:b(?:.))|\((?:0|B)|\](?:(?:[^\007]*)\007|(?:(?!\x1b\\).)*\x1b\\)|(?:P|X|\^|_)(?:(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007))|(?:\#(?:3|4|5|6|8)))) + (?:\x1b(?:\[(?:[\x20-\x3f]*[\x40-\x7e])|a|c|7|8|M|D|E|H|=|>|<|A|B|C|I|J|K|L|M|Z|(?:Y(?:..))|(?:b(?:.))|\((?:0|B)|\](?:(?:[^\007]*)\007|(?:(?!\x1b\\).)*\x1b\\)|(?:P|X|\^|_)(?:(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007))|(?:\#(?:3|4|5|6|8)))) |(?:\u0090|\u0098|\u009E|\u009F)(?:(?!\u009c).)*(?:\u009c) - |(?:\u009b)[\x20-\x2f\x30-\x3f]*[\x40-\x7e] + |(?:\u009b)[\x20-\x3f]*[\x40-\x7e] |(?:\u009d)(?:[^\u009c]*)?\u009c } #--- + #todo + #variable re_ansi_detectcode $re_ansi_detect + #variable re_ansi_detectcode {\x1b[\u0040-\u005F]|\x1b[\u0060-\u007e]|\x1b[\u0030-\u003F]|\x1b[\u0020-\u002F]+[\u0030-\u007E]} + variable re_ansi_detectcode {(?:\x1b(?:[\u0030-\u007E]|[\u0020-\u002F]+[\u0030-\u007E]))|[\u0090-\u009F]} # -- --- --- --- #variable re_csi_code {(?:\x1b\[|\u009b)[\x20-\x2f\x30-\x3f]*[\x40-\x7e]} - variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_standalones}|${re_standalones_vt52}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" #may be same as detect - kept in case detect needs to diverge #variable re_ansi_split "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" @@ -5827,20 +5930,6 @@ tcl::namespace::eval punk::ansi::ta { set re_ansi_split_multi "(?:${re_ansi_split})+" } - lappend PUNKARGS [list -dynamic 0 { - @id -id ::punk::ansi::ta::detect - @cmd -name punk::ansi::ta::detect -help\ - "Return a boolean indicating whether Ansi codes were detected in text. - Important caveat: - When text is a tcl list made from splitting (or lappending) some ansi string - - individual elements may be braced or have certain chars escaped. - (one example is if a list element contains an unbalanced brace) - This can cause square brackets that form part of the ansi to be backslash escaped - - and the function can fail to match it as an Ansi code. - " - @values -min 1 - text -type string - } ] #*** !doctools #[call [fun detect] [arg text]] @@ -5851,10 +5940,35 @@ tcl::namespace::eval punk::ansi::ta { #detect any ansi escapes #review - only detect 'complete' codes - or just use the opening escapes for performance? + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::detect + @cmd -name punk::ansi::ta::detect\ + -summary\ + "Test if text has completed ANSI codes"\ + -help\ + "Return a boolean indicating whether *complete* Ansi codes were detected in text. + + By complete, it means that paired squences such as PM (privacy message) must be + closed. It also means that a truncated sequence such as \\x1b\\\[ or a lone escape + will not be detected as ANSI. + + Use punk::ansi::ta::detectcode as a slightly faster detector for ANSI codes, that does + not require paired sequences to have both starting and end sequences to be detected. + + Important caveat: + When text is a tcl list made from splitting (or lappending) some ansi string + - individual elements may be braced or have certain chars escaped. + (one example is if a list element contains an unbalanced brace) + This can cause square brackets that form part of the ansi to be backslash escaped + - and the function can fail to match it as an Ansi code. + " + @values -min 1 + text -type string -help\ + "Block of text. See caveat above about lists." + } ] proc detect {text} [string map [list [list $re_ansi_detect]] { regexp $text }] - #can be used on dicts - but will check keys too. keys could also contain ansi and have escapes proc detect_in_list {list} { #loop is commonly faster than using join. (certain ansi codes triggering list quoting? review) @@ -5865,6 +5979,22 @@ tcl::namespace::eval punk::ansi::ta { } return 0 } + + #will detect for example lone opening or closing PM + proc detectcode {text} [string map [list [list $re_ansi_detectcode]] { + regexp $text + }] + proc detectcode_in_list {list} { + #loop is commonly faster than using join. (certain ansi codes triggering list quoting? review) + foreach item $list { + if {[detectcode $item]} { + return 1 + } + } + return 0 + } + + #surprisingly - the ::join operation can be (relatively) slow on lists containing ansi proc detect_in_list2 {list} { detect [join $list " "] @@ -5915,6 +6045,8 @@ tcl::namespace::eval punk::ansi::ta { variable re_sgr expr {[regexp $re_sgr $text]} } + + #perl: ta_strip proc strip {text} { #*** !doctools #[call [fun strip] [arg text]] @@ -5922,6 +6054,39 @@ tcl::namespace::eval punk::ansi::ta { #[para]This is a tailcall to punk::ansi::ansistrip tailcall ansistrip $text } + + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::extract + @cmd -name punk::ansi::ta::extract\ + -summary\ + "Return only the ANSI codes in text"\ + -help\ + "This is the opposite of strip, + returning only the ANSI codes in text." + @values -min 1 -max 1 + text -type string + } ] + proc extract {text} { + set parts [split_codes $text] + set out "" + foreach {pt code} $parts { + append out $code + } + return $out + } + + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::length + @cmd -name punk::ansi::ta::length\ + -summary\ + "Calculate length of text (excluding the ANSI codes)"\ + -help\ + "Calculate length of text (excluding the ANSI codes) + This is not the printing length of the string on screen." + @values -min 1 + text -type string + } ] + #perl: ta_length proc length {text} { #*** !doctools #[call [fun length] [arg text]] @@ -5936,6 +6101,8 @@ tcl::namespace::eval punk::ansi::ta { # #} + #perl: ta_trunc + #truncate $text to $width columns while still including all the ANSI colour codes. proc trunc {text width args} { } @@ -8504,6 +8671,10 @@ tcl::namespace::eval punk::ansi::internal { } } +tcl::namespace::eval punk::ansi { + namespace import ::punk::ansi::ta::detect +} + #inserting into global namespace like this should be kept to a minimum.. but this is considered a core aspect of punk::ansi #todo - document interp alias {} ansistring {} ::punk::ansi::ansistring diff --git a/src/bootsupport/modules/punk/args-0.2.tm b/src/bootsupport/modules/punk/args-0.2.tm index fc438d57..a6224c0d 100644 --- a/src/bootsupport/modules/punk/args-0.2.tm +++ b/src/bootsupport/modules/punk/args-0.2.tm @@ -1364,6 +1364,11 @@ tcl::namespace::eval punk::args { #} tcl::dict::set tmp_optspec_defaults -type $v } + -defaultdisplaytype { + #how the -default is displayed + #-default doesn't have to be the same type as -type which validates user input that is not defaulted. + tcl::dict::set tmp_optspec_defaults -defaultdisplaytype $v + } -parsekey { tcl::dict::set tmp_optspec_defaults -parsekey $v @@ -1441,7 +1446,7 @@ tcl::namespace::eval punk::args { default { set known { -parsekey -group -grouphelp\ -any -anyopts -arbitrary -form -minsize -maxsize -choices -choicegroups -choicemultiple -choicecolumns -choicelabels -choiceinfo\ - -type -range -typeranges -default -typedefaults + -type -range -typeranges -default -defaultdisplaytype -typedefaults -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted -nocase\ -nominsize -nomaxsize -norange -nochoices -nochoicelabels\ -type -optional -allow_ansi -validate_ansistripped -strip_ansi -multiple -prefix\ @@ -2052,6 +2057,9 @@ tcl::namespace::eval punk::args { tcl::dict::set spec_merged -optional 1 } } + -defaultdisplaytype { + tcl::dict::set spec_merged -defaultdisplaytype $specval + } -typedefaults { set typecount [llength [tcl::dict::get $spec_merged -type]] if {$typecount != [llength $specval]} { @@ -2114,7 +2122,7 @@ tcl::namespace::eval punk::args { -form -type\ -parsekey -group\ -range -typeranges\ - -default -typedefaults\ + -default -defaultdisplaytype -typedefaults\ -minsize -maxsize -choices -choicegroups\ -choicemultiple -choicecolumns -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted\ -nocase -optional -multiple -validate_ansistripped -allow_ansi -strip_ansi -help\ @@ -3784,7 +3792,32 @@ tcl::namespace::eval punk::args { } if {[dict exists $arginfo -default]} { - set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + #default isn't necessarily of same type as -type required for validation + #Guessing at the type from the data is likely to be unsatisfactory. + + set defaultdisplaytype [Dict_getdef $arginfo -defaultdisplaytype string] + switch -- $defaultdisplaytype { + dict { + #single level + set rawdefault [dict get $arginfo -default] + set default "{\n" + dict for {k v} $rawdefault { + append default " \"$k\" \"$v\"\n" + } + append default "}" + } + list { + set default "{\n" + foreach v $rawdefault { + append default " \"$v\"\n" + } + append default "}" + } + default { + #set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + set default "'[dict get $arginfo -default]'" + } + } } else { set default "" } diff --git a/src/bootsupport/modules/punk/char-0.1.0.tm b/src/bootsupport/modules/punk/char-0.1.0.tm index f8123b94..69df08b9 100644 --- a/src/bootsupport/modules/punk/char-0.1.0.tm +++ b/src/bootsupport/modules/punk/char-0.1.0.tm @@ -89,6 +89,107 @@ tcl::namespace::eval punk::char { variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? + + #more useful for referring to ANSI documentation would be a proper 7-bit and 8-bit 'Code Table' layout + #as described in ECMA-35 5.2 + # where the positions of the table are in one-to-one correspondence with the bit combinations of the code. + #- for 7-bit: 8 columns 16 rows + #- for 8-bit 16 columns 16 rows + proc codetable {which} { + set bits 8 + switch -- $which { + ascii8 { + set which default + } + ascii { + set bits 7 + } + default { + if {$which ni [encoding names]} { + error "codetable unsupported - use 'ascii' or an entry from the result of the 'encoding names' command." + } + } + } + package require punk::ansi + + set hibit_count [expr {$bits-4}] + set bitcolumns [expr {2**$hibit_count}] ;#always 4 bits for the rows - remaining bits for the columns + set columncount [expr {$bitcolumns + 6}] + + + #set header1 [list "" "b7\nb6\nb5" "0\n0\n0" "0\n0\n1" "0\n1\n0" "0\n1\n1" "1\n0\n0" "1\n0\n1" "1\n1\n0" "1\n1\n1"] + set header1 [list] + set hibits_label "" + set indent "" + for {set hb $bits} {$hb > 4} {incr hb -1} { + append hibits_label ${indent}b$hb\n + append indent " " + } + set hibits_label [string range $hibits_label 0 end-1] + lappend header1 $hibits_label "" "" "" "" "" + + for {set colidx 0} {$colidx < $bitcolumns} {incr colidx} { + set binval [format %0${hibit_count}b $colidx] + set binvalbits [split $binval ""] + set indent "" + set display_hibits "" + foreach bb $binvalbits { + append display_hibits $indent$bb\n + append indent " " + } + set display_hibits [string range $display_hibits 0 end-1] + lappend header1 $display_hibits + } + #\u2193 down arrow + #right-down arrows + #\u2ba7 + #\u21b4 + #\u2b0e + set header2 [list " Bits" b4 b3 b2 b1 "column \u2192\nrow \u2b0e" {*}[punk::lib::range 0 $bitcolumns-1]] + set headers [list $header1 $header2] + + #set t [textblock::table -return tableobject -rows $rows] + set t [textblock::table -return tableobject] + #todo - fix textblock::table to allow configure -columncount + for {set c 0} {$c < $columncount} {incr c} { + $t add_column + } + + set vheaders [punk::transpose_equal_lists $headers] + set hidx -1 + foreach vh $vheaders { + incr hidx + $t configure_column $hidx -headers $vh + } + $t configure_header 0 -colspans [list 6 0 0 0 0 0 {*}[lrepeat $bitcolumns 1]] + $t configure_column 0 -blockalign left + + #always 16 rows - remaining bits form the columns + for {set ridx 0} {$ridx <= 15} {incr ridx} { + set charlist [list] + set lowbits [format %04b $ridx] + for {set i 0} {$i < $bitcolumns} {incr i} { + set hibits [format %0${hibit_count}b $i] + set ch [format %c [scan ${hibits}${lowbits} %b]] + #puts "-->${hibits}${lowbits} ch:$ch" + if {$which ne "default"} { + if {[catch {encoding convertfrom $which $ch} ch]} { + set ch [punk::ansi::a red bold]-[punk::ansi::a] + lappend charlist $ch + } else { + lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch] + } + } else { + lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch] + } + } + set r [list "" {*}[split $lowbits ""] $ridx {*}$charlist] + $t add_row $r + } + puts stderr $t + $t print + } + #just the 7-bit ascii. use [page ascii] for the 8-bit layout proc ascii {} {return { 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL diff --git a/src/bootsupport/modules/punk/console-0.1.1.tm b/src/bootsupport/modules/punk/console-0.1.1.tm index 4322ceaa..ea8d3f77 100644 --- a/src/bootsupport/modules/punk/console-0.1.1.tm +++ b/src/bootsupport/modules/punk/console-0.1.1.tm @@ -1336,6 +1336,7 @@ namespace eval punk::console { #https://vt100.net/docs/vt510-rm/DA1.html # proc get_device_attributes {{inoutchannels {stdin stdout}}} { + #Note the vt52 rough equivalen \x1bZ - commonly supported but probably best considered obsolete as it collides with ECMA 48 SCI Single Character Introducer #DA1 variable last_da1_result #first element in result is the terminal's architectural class 61,62,63,64.. ? diff --git a/src/bootsupport/modules/punk/lib-0.1.2.tm b/src/bootsupport/modules/punk/lib-0.1.2.tm index 6ce76618..46cd5668 100644 --- a/src/bootsupport/modules/punk/lib-0.1.2.tm +++ b/src/bootsupport/modules/punk/lib-0.1.2.tm @@ -2242,6 +2242,121 @@ namespace eval punk::lib { } } + + #review - compare to IMAP4 methods of specifying ranges? + punk::args::define { + @id -id ::punk::lib::indexset_resolve + @cmd -name punk::lib::indexset_resolve\ + -summary\ + "Resolve an indexset to a list of integers based on supplied list or string length."\ + -help\ + "Resolve an 'indexset' to a list of actual indices within the range of the provided numitems value. + An indexset consists of a comma delimited list of indexes or index-ranges. + The indexes are 0-based. + Ranges must be specified with .. as the separator. + Whitespace is ignored. + Each index (or endpoint of an index-range) can be of the forms accepted by Tcl list or string commands, + e.g end-2 or 2+2. + + end means the last page. + end-1 means the second last page. + 0.. is the same as 0..end. + examples: + 1,3.. + output the page index 1 (2nd page) followed by all from index 3 to the end. + 0-2,end + output the first 3 pages, and the last page. + end-1..0 + output the indexes in reverse order from 2nd last page to first page." + @values -min 2 -max 2 + numitems -type integer + indexset -type string + } + proc indexset_resolve {numitems indexset} { + if {![string is integer -strict $numitems] || ![regexp {^[\-\+_end,\.0-9]*$} $indexset]} { + #use parser on unhappy path only + set errmsg [punk::args::usage -scheme error ::punk::lib::indexset_resolve] + uplevel 1 [list return -code error -errorcode {TCL WRONGARGS PUNK} $errmsg] + } + set index_list [list] ;#list of actual indexes within the range + set iparts [split $indexset ,] + set index_list [list] + foreach ipart $iparts { + set ipart [string trim $ipart] + set rposn [string first .. $ipart] + if {$rposn>=0} { + #range + lassign [punk::lib::string_splitbefore_indices $ipart $rposn $rposn+2] rawa _ rawb + set rawa [string trim $rawa] + set rawb [string trim $rawb] + if {$rawa eq ""} {set rawa 0} + set a [punk::lib::lindex_resolve $numitems $rawa] + if {$a == -3} { + #undershot - leave negative + } elseif {$a == -2 && $rawa ne "-2"} { + #overshot + set a [expr {$numitems}] ;#put it outside the range on the upper side + } + + if {$rawb eq ""} { + if {$a > $numitems-1} { + set rawb $a ;#make sure .. doesn't return last item - should return nothing + } else { + set rawb end + } + } + set b [punk::lib::lindex_resolve $numitems $rawb] + if {$b == -3} { + #undershot - leave negative + } elseif {$b == -2 && $rawb ne "-2"} { + set b [expr {$numitems}] ;#overshot - put it outside the range on the upper side + } + + #e.g make sure .. doesn't return last item - should return nothing as both are above the range. + if {$a >= 0 && $a <= $numitems-1 && $b >=0 && $b <= $numitems-1} { + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } else { + if {$a >= 0 && $a <= $numitems-1} { + #only a is in the range + if {$b < 0} { + set b 0 + } else { + set b [expr {$numitems-1}] + } + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } elseif {$b >=0 && $b <= $numitems-1} { + #only b is in the range + if {$a < 0} { + set a 0 + } else { + set a [expr {$numitems-1}] + } + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } else { + #both outside the range + if {$a < 0 && $b > 0} { + #spans the range in forward order + set a 0 + set b [expr {$numitems-1}] + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } elseif {$a > 0 && $b < 0} { + #spans the range in reverse order + set a [expr {$numitems-1}] + set b 0 + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } + #both outside of range on same side + } + } + } else { + set idx [punk::lib::lindex_resolve_basic $numitems $ipart] + if {$idx >= 0} { + lappend index_list $idx + } + } + } + return $index_list + } # showdict uses lindex_resolve results -2 & -3 to determine whether index is out of bounds on upper vs lower side #REVIEW: This shouldn't really need the list itself - just the length would suffice punk::args::define { @@ -2305,7 +2420,8 @@ namespace eval punk::lib { #<0 ? error "lindex_resolve len must be an integer" } - set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + set index [tcl::string::map {_ {}} $index] ;#basic forward compatibility with integers such as 1_000 for 8.6 + #todo - be stricter about malformations such as 1000_ if {[string is integer -strict $index]} { #can match +i -i if {$index < 0} { @@ -3345,8 +3461,12 @@ namespace eval punk::lib { #NOTE: running ta::detect on a list (or dict) as a whole can be problematic if items in the list have backslash escapes due to Tcl list quoting and escaping behaviour. #This commonly happens if there is an unbalanced brace (which is a normal occurrence and needs to be handled) #ta::detect on a list of ansi-containing string may appear to work for some simple inputs but is not reliable - #detect_in_list will check at first level. (not intended for detecting ansi in deeper structures) - if {![punk::ansi::ta::detect_in_list $linelist]} { + #detect_in_list/detectcode_in_list will check at first level. (not intended for detecting ansi in deeper structures) + + #we use detectcode_in_list instead of detect_in_list + #detectcode_in_list will detect unclosed (or unopened) paired sequences such as PM (privacy message) + # - but the main reason is it is slightly faster. + if {![punk::ansi::ta::detectcode_in_list $linelist]} { if {$opt_ansiresets} { foreach ln $linelist { lappend transformed $RST$ln$RST diff --git a/src/bootsupport/modules/punk/ns-0.1.0.tm b/src/bootsupport/modules/punk/ns-0.1.0.tm index f8e55b02..10fda84e 100644 --- a/src/bootsupport/modules/punk/ns-0.1.0.tm +++ b/src/bootsupport/modules/punk/ns-0.1.0.tm @@ -13,6 +13,9 @@ # @@ Meta End +#BUGS +# 2025-08 +# n// and n/// won't output info about 'namespace path' if there are no commands in the namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements @@ -1235,7 +1238,7 @@ tcl::namespace::eval punk::ns { if {[dict size [dict get $nsdict namespacepath]]} { set path_text "" if {!$opt_nspathcommands} { - append path_text \n " Also resolving cmds in namespace paths: [dict keys [dict get $nsdict namespacepath]]" + append path_text \n " Also resolving cmds in namespace paths: [dict keys [dict get $nsdict namespacepath]] (use n/// to display)" } else { append path_text \n " Also resolving cmds in namespace paths:" set nspathdict [dict get $nsdict namespacepath] @@ -1247,8 +1250,17 @@ tcl::namespace::eval punk::ns { } } else { #todo - change to display in column order to be same as main command listing + set parentcommands [dict get $nsdict commands] dict for {k v} $nspathdict { - set pathcommands [dict get $v commands] + set rawpathcommands [dict get $v commands] + set pathcommands [list] + foreach c $rawpathcommands { + if {$c in $parentcommands} { + lappend pathcommands [punk::ansi::a strike]$c[a] + } else { + lappend pathcommands $c + } + } set columns 6 if {[llength $pathcommands] < 6} { set columns [llength $v] diff --git a/src/bootsupport/modules/shellfilter-0.2.tm b/src/bootsupport/modules/shellfilter-0.2.tm index e04e5107..8017d3f5 100644 --- a/src/bootsupport/modules/shellfilter-0.2.tm +++ b/src/bootsupport/modules/shellfilter-0.2.tm @@ -579,13 +579,15 @@ namespace eval shellfilter::chan { #punk::ansi::ansistrip converts at least some of the box drawing G0 chars to unicode - todo - more complete conversion #assumes line-buffering. a more advanced filter required if ansicodes can arrive split across separate read or write operations! oo::class create ansistrip { - variable o_trecord + variable o_trecord variable o_enc + variable o_encbuf variable o_is_junction constructor {tf} { package require punk::ansi set o_trecord $tf set o_enc [dict get $tf -encoding] + set o_encbuf "" if {[dict exists $tf -junction]} { set o_is_junction [dict get $tf -junction] } else { @@ -614,10 +616,36 @@ namespace eval shellfilter::chan { method flush {transform_handle} { return "" } + #method write {transform_handle bytes} { + # #broken due to occasional unexpected byte sequence + # set instring [encoding convertfrom $o_enc $bytes] + # set outstring [punk::ansi::ansistrip $instring] + # return [encoding convertto $o_enc $outstring] + #} method write {transform_handle bytes} { - set instring [encoding convertfrom $o_enc $bytes] - set outstring [punk::ansi::ansistrip $instring] - return [encoding convertto $o_enc $outstring] + #set instring [tcl::encoding::convertfrom $o_enc $bytes] ;naive approach will break due to unexpected byte sequence - occasionally + #bytes can break at arbitrary points making encoding conversions invalid. + + set inputbytes $o_encbuf$bytes + set o_encbuf "" + set tail_offset 0 + while {$tail_offset < [string length $inputbytes] && [catch {tcl::encoding::convertfrom $o_enc [string range $inputbytes 0 end-$tail_offset]} stringdata]} { + incr tail_offset + } + if {$tail_offset > 0} { + if {$tail_offset < [string length $inputbytes]} { + #stringdata from catch statement must be a valid result + set t [expr {$tail_offset - 1}] + set o_encbuf [string range $inputbytes end-$t end] + } else { + set stringdata "" + set o_encbuf $inputbytes + return "" + } + } + + set outstring [punk::ansi::ansistrip $stringdata] + return [tcl::encoding::convertto $o_enc $outstring] } method meta_is_redirection {} { return $o_is_junction @@ -724,6 +752,8 @@ namespace eval shellfilter::chan { set emit "" if {[string last \x1b $buf] >= 0} { #detect will detect ansi SGR and gron groff and other codes + #REVIEW - ta::detect won't detect SOS without paired ST for things like PM + # ta::detectcode will - but then split_codes_single will treat unpaired SOS as text? if {[punk::ansi::ta::detect $buf]} { #split_codes_single regex faster than split_codes - but more resulting parts #'single' refers to number of escapes - but can still contain e.g multiple SGR codes (or mode set operations etc) @@ -1035,7 +1065,7 @@ namespace eval shellfilter::chan { } #todo - something oo::class create rebuffer { - variable o_trecord + variable o_trecord variable o_enc constructor {tf} { set o_trecord $tf diff --git a/src/bootsupport/modules/textblock-0.1.3.tm b/src/bootsupport/modules/textblock-0.1.3.tm index de0164cd..97969463 100644 --- a/src/bootsupport/modules/textblock-0.1.3.tm +++ b/src/bootsupport/modules/textblock-0.1.3.tm @@ -2528,7 +2528,7 @@ tcl::namespace::eval textblock { set ansiborder_final $ansiborder_body_col_row #$c will always have ansi resets due to overtype behaviour ? #todo - review overtype - if {[punk::ansi::ta::detect $c]} { + if {[punk::ansi::ta::detectcode $c]} { #use only the last ansi sequence in the cell value #Filter out foreground and use background for ansiborder override set parts [punk::ansi::ta::split_codes_single $c] @@ -4377,6 +4377,7 @@ tcl::namespace::eval textblock { if {![punk::ansi::ta::detect $block]} { return $block } + #could be newine in SOS/PM? review - terminal should theoretically ignore all chars til close of string foreach ln [split $block \n] { set parts [punk::ansi::ta::split_codes $ln] if {[lindex $parts 0] eq ""} { @@ -4894,7 +4895,9 @@ tcl::namespace::eval textblock { } set textblock [textutil::tabify::untabify2 $textblock $tw] } - if {[string length $textblock] > 1 && [punk::ansi::ta::detect $textblock]} { + #review detectcode vs detect. detectcode won't look for completness of all codes to give a positive result + #ansistripraw splits on complete codes.. + if {[string length $textblock] > 1 && [punk::ansi::ta::detectcode $textblock]} { #ansistripraw slightly faster than ansistrip - and won't affect width (avoid detect_g0/conversions) set textblock [punk::ansi::ansistripraw $textblock] } @@ -4917,7 +4920,9 @@ tcl::namespace::eval textblock { } else { set tl $textblock } - if {[punk::ansi::ta::detect $tl]} { + #review detectcode vs detect. detectcode won't look for completness of all codes to give a positive result + #ansistripraw splits on complete codes.. + if {[punk::ansi::ta::detectcode $tl]} { set tl [punk::ansi::ansistripraw $tl] } return [punk::char::ansifreestring_width $tl] @@ -4964,7 +4969,7 @@ tcl::namespace::eval textblock { set textblock [textutil::tabify::untabify2 $textblock $tw] } #ansistripraw on entire block in one go rather than line by line - result should be the same - review - make tests - if {[string length $textblock] > 1 && [punk::ansi::ta::detect $textblock]} { + if {[string length $textblock] > 1 && [punk::ansi::ta::detectcode $textblock]} { set textblock [punk::ansi::ansistripraw $textblock] } if {[tcl::string::last \n $textblock] >= 0} { @@ -5211,7 +5216,7 @@ tcl::namespace::eval textblock { set known_hasansi [tcl::dict::get $opts -known_hasansi] if {$known_hasansi eq ""} { - set block_has_ansi [punk::ansi::ta::detect $block] + set block_has_ansi [punk::ansi::ta::detectcode $block] } else { set block_has_ansi $known_hasansi } @@ -5648,7 +5653,7 @@ tcl::namespace::eval textblock { set rowcount 0 set blocklists [list] foreach b $blocks { - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { #-ansireplays 1 quite expensive e.g 7ms in 2024 set bl [punk::lib::lines_as_list -ansireplays 1 -ansiresets $ansiresets -- $b] } else { @@ -5676,7 +5681,7 @@ tcl::namespace::eval textblock { set i -1 foreach b $args { incr i - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { #-ansireplays 1 quite expensive e.g 7ms in 2024 set blines [punk::lib::lines_as_list -ansireplays 1 -ansiresets auto -- $b] } else { @@ -5799,7 +5804,7 @@ tcl::namespace::eval textblock { #testing of any decent size block line by line - even with max_string_length_line is slower than ta::detect anyway. #blocks passed to join can be ragged - so we can't pass -known_samewidth to pad - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { # - we need to join to use pad - even though we then need to immediately resplit REVIEW (make line list version of pad?) set replay_block [::join [punk::lib::lines_as_list -ansireplays 1 -ansiresets $ansiresets -- $b] \n] #set blines [split [textblock::pad $replay_block -known_hasansi 1 -known_samewidth $is_samewidth -known_blockwidth $bwidth -width $bwidth -which right -padchar " "] \n] @@ -8650,7 +8655,7 @@ tcl::namespace::eval textblock { set cache_inner [overtype::block -blockalign $opt_blockalign -ellipsis $opt_ellipsis $underlay $cache_contentpattern] #puts "frame--->ansiwrap -rawansi [ansistring VIEW $opt_ansibase] $cache_inner" if {$opt_ansibase ne ""} { - if {[punk::ansi::ta::detect $cache_inner]} { + if {[punk::ansi::ta::detectcode $cache_inner]} { #set cache_inner [punk::ansi::ansiwrap -rawansi $opt_ansibase $cache_inner] #set cache_inner [punk::ansi::ansiwrap_raw $opt_ansibase "" "" $cache_inner] #jjj ??? review @@ -8745,7 +8750,7 @@ tcl::namespace::eval textblock { #set cwidth [textblock::width $contents] #JJJ - set contents_has_ansi [punk::ansi::ta::detect $contents] + set contents_has_ansi [punk::ansi::ta::detectcode $contents] if {$opt_ansibase ne ""} { if {$contents_has_ansi} { #set contents [punk::ansi::ansiwrap -rawansi $opt_ansibase $contents] @@ -8799,8 +8804,9 @@ tcl::namespace::eval textblock { if {$subposn >= 0} { set content_line [lindex $clines $contentindex] #review - different forms of reset e.g \x1b\[m ?? - if {[string range $content_line 0 3] eq "\x1b\[0m"} { + if {[tcl::string::range $content_line 0 3] eq "\x1b\[0m"} { set content_line [tcl::string::range $content_line 4 end] + #::tcl::string::replace content_line 0 3 } append content_line $opt_ansibase append fs [tcl::string::replace $tline $subposn $subposn+$pattern_offset $content_line] \n diff --git a/src/modules/punk/args-999999.0a1.0.tm b/src/modules/punk/args-999999.0a1.0.tm index b35a0094..8e55fdd5 100644 --- a/src/modules/punk/args-999999.0a1.0.tm +++ b/src/modules/punk/args-999999.0a1.0.tm @@ -1364,6 +1364,11 @@ tcl::namespace::eval punk::args { #} tcl::dict::set tmp_optspec_defaults -type $v } + -defaultdisplaytype { + #how the -default is displayed + #-default doesn't have to be the same type as -type which validates user input that is not defaulted. + tcl::dict::set tmp_optspec_defaults -defaultdisplaytype $v + } -parsekey { tcl::dict::set tmp_optspec_defaults -parsekey $v @@ -1441,7 +1446,7 @@ tcl::namespace::eval punk::args { default { set known { -parsekey -group -grouphelp\ -any -anyopts -arbitrary -form -minsize -maxsize -choices -choicegroups -choicemultiple -choicecolumns -choicelabels -choiceinfo\ - -type -range -typeranges -default -typedefaults + -type -range -typeranges -default -defaultdisplaytype -typedefaults -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted -nocase\ -nominsize -nomaxsize -norange -nochoices -nochoicelabels\ -type -optional -allow_ansi -validate_ansistripped -strip_ansi -multiple -prefix\ @@ -2052,6 +2057,9 @@ tcl::namespace::eval punk::args { tcl::dict::set spec_merged -optional 1 } } + -defaultdisplaytype { + tcl::dict::set spec_merged -defaultdisplaytype $specval + } -typedefaults { set typecount [llength [tcl::dict::get $spec_merged -type]] if {$typecount != [llength $specval]} { @@ -2114,7 +2122,7 @@ tcl::namespace::eval punk::args { -form -type\ -parsekey -group\ -range -typeranges\ - -default -typedefaults\ + -default -defaultdisplaytype -typedefaults\ -minsize -maxsize -choices -choicegroups\ -choicemultiple -choicecolumns -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted\ -nocase -optional -multiple -validate_ansistripped -allow_ansi -strip_ansi -help\ @@ -3784,7 +3792,32 @@ tcl::namespace::eval punk::args { } if {[dict exists $arginfo -default]} { - set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + #default isn't necessarily of same type as -type required for validation + #Guessing at the type from the data is likely to be unsatisfactory. + + set defaultdisplaytype [Dict_getdef $arginfo -defaultdisplaytype string] + switch -- $defaultdisplaytype { + dict { + #single level + set rawdefault [dict get $arginfo -default] + set default "{\n" + dict for {k v} $rawdefault { + append default " \"$k\" \"$v\"\n" + } + append default "}" + } + list { + set default "{\n" + foreach v $rawdefault { + append default " \"$v\"\n" + } + append default "}" + } + default { + #set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + set default "'[dict get $arginfo -default]'" + } + } } else { set default "" } diff --git a/src/modules/punk/pdf-999999.0a1.0.tm b/src/modules/punk/pdf-999999.0a1.0.tm index 17e2f8b7..df74d26c 100644 --- a/src/modules/punk/pdf-999999.0a1.0.tm +++ b/src/modules/punk/pdf-999999.0a1.0.tm @@ -82,70 +82,126 @@ namespace eval ::punk::pdf { variable base_x_per_c 4 ;#fixed width output font's x_per_c. variable cfg_blocksep_default "#pdf::text:blockstart type %type% pageindex %pageindex% blockindex %blockindex% y-index %y-index% tlc {%x0% %y0%} brc {%brc%} warnings %warnings% : %marker%" variable cfg_pagesep_default "#pdf::text:pagestart index %index% pageblocks %pageblocks% textblocks %textblocks% imageblocks %imageblocks% : [a bold green]PAGE[a]" + variable cfg_header_default + set cfg_header_default \ +"#pdf::text::header doc %doc% pages %pages% size %size% sha1 %sha1% punk::pdf %parserversion% +# args +# %args%" + + variable callid 0 variable results + variable test array set results {} punk::args::define { @id -id ::punk::pdf::text - @cmd -name punk::pdf::text -help\ + @cmd -name punk::pdf::text\ + -summary\ + "Extract text from non-image parts of a PDF"\ + -help\ "Extract text lines from a pdf - (No OCR - will only retrieve actual text contents)" + (No OCR - will only retrieve actual text contents) + The text function can only operate on the text spans provided by the underlying + engine (MuPDF) - which in some cases (for example spaces adjacent to brackets in + PDF32000_2008.pdf p24 eg4) does not interpret the number of whitespace characters + as a human reader would. The text data supplied for formulas such as in 7.10.1 of + the above document can be missing symbols such as the square-root symbol. This would + have to be converted to a unicode square root symbol with additional bracketing to + maintain meaning. The positioning of subscript numerals would also need to be + processed - and without font size information from tclMuPDF - this is currently + not practical to process reasonably. Such formulas will come out mangled. + " @leaders -min 0 -max 0 @opts -min 0 -max 2 -p|-page_indexes -parsekey -page_indexes -type string -default "0.." -help\ "Comma delimited list of indexes and/or ranges specifying which pages to output. - The indexes are 0-based. - Ranges must be specified with .. as the separator. - end means the last page. - end-1 means the second last page. - 0.. is the same as 0..end. - examples: - 1,3.. - output the page index 1 (2nd page) followed by all from index 3 to the end. - 0-2,end - output the first 3 pages, and the last page. - end-1..0 - output the pages in reverse order from 2nd last page to first page." + The indexes are 0-based. + Ranges must be specified with .. as the separator. + end means the last page. + end-1 means the second last page. + 0.. is the same as .. and is the same as 0..end + examples: + 1,3.. + output the page index 1 (2nd page) followed by all from index 3 to the end. + 0..2,end + output the first 3 pages, and the last page. + end-1..0 + output the pages in reverse order from 2nd last page to first page." -b|-block_indexes -parsekey -block_indexes -type string -default "0.." -help\ "Comma delimited list of indexes and/or ranges specifying which blocks to output. - Format is as per -page_indexes" - -merge_yblocks -default true -help\ - "Whether to merge blocks vertically when they have the same starting y-index. - This usually makes the output more in line with the source document - but - in some scenarios it might be desired to keep the blocks listed vertically - even though they appear side by side in the source." + Format is as per -page_indexes" + -merge_yblocks -default false -help\ + "Whether to merge blocks vertically when they interleave in the y direction. + This usually makes the output more in line with the source document - but + in some scenarios it might be desired to keep the blocks listed vertically + even though they appear side by side in the source. Such vertical listing as + given by default when -merge_yblocks is false, does not necessarily match the + left to right order. + Note that especially when there is small-font text with other text to the right, + The small-font text that is not rightmost may be truncated (with trailing ellipsis) + to fit it into the vertically merged layout. Such truncation produces warnings on + the -errchannel and increments the warning count visible in the -sep-block line + preceding the merged blocks. + Smaller print that doesn't fit in the layout is sometimes not required to extract + the main data from a document such as an invoice - and the merged data can make + other elements more amenable to parsing. In practice, 2 passes with -merge_yblocks + configured on and off, combined with -sep_block data and a set of custom rules + based on the specific document set may be required for information extraction." + #------------------------------------------------------------------------------------ -compact -type boolean -default false -help\ "If compact is false - blank lines approximating the vertical spacing in the source document will be emitted between each block. If -merged_yblocks is true - there will still be potential vertical whitespace within any merged blocks, as the point of -merge_yblocks is to align elements - closely to that within the source document, but there will be no vertical layout - spacing between such blocks, or other non-merging blocks." - -image_place_ansi -type dict -default {border Term-grey78 inner Term-grey58 small Term-grey15} -help\ - "3-element list of ANSI colour codes for the textual image placeholders. + closely to that within the source document, but there will be no vertical + layout spacing between such merged blocks, or other non-merging blocks. + This setting is redundant if -postcompact is true." + -postcompact -type boolean -default false -help\ + "Strip out vertical spacing (empty or whitespace-only lines) from all blocks. + This sets -compact to true, and removes vertical gaps from any merged blocks. + The most vertically compact representation of the text will result from setting + -merge true, -postcompact true and -image_place_ansi to the empty string, as + well as omitting (or setting to empty string) -sep_page and -sep_block. + Note that if the source PDF actually positioned blank lines as strings of + whitespace, these too would be removed. + If -image_place_ansi is left as the default or has other ansi codes, the + image placeholders will not be stripped, even if they consist of the default + space characters. + If -sep_block or -sep_page are set to one or more spaces - these blank lines + will still be emitted." + -shrink_textfree_blocks -type boolean -default true -help\ + "If a block is image only - don't output the placeholders. + These placeholders will still be emitted if -merge_yblocks is true and they + are in a merged block that has also merged text blocks. Set -image_place_ansi + to the empty string to suppress ANSI background for all image placeholders." + -image_place_ansi -type dict -defaultdisplaytype dict -default {border Term-grey78 inner Term-grey58 small Term-grey15} -help\ + "Dict of ANSI colour codes for the textual image placeholders. Colour codes are those as listed by 'punk::ansi::a?' - If the default -image_place_chars values of the space character are being used, the colour - names should begin with a capital letter (indicating background as opposed to foreground colour). + If the default -image_place_chars values of the space character are being used, + the colour names should begin with a capital letter (indicating background as + opposed to foreground colour). The placeholders are by default space characters with the ANSI applied. - This means that just by stripping ANSI from the output, layout is maintained, and the - text can be processed simply, or the ANSI can be retained if your text parser wishes - to use it to make decisions. - The colour at element 'border' is for the border colour, 'inner' for the area within the border, - and the colour at element 'small' is for images too small for border representation. The border - bears no relation to the underlying image - and is merely to aid in seeing when images overlap - or are cropped etc. The dimensions of the placeholder are approximates only to the source - images, as they are in multiples of the fixed-width output font character width and - line height. - Setting -image_place_ansi to an empty value, or -image_place_chars to an empty value - will stop ANSI from being output at the image locations." - -image_place_chars -type dict -default {tlc " " hlt " " trc " " vlr " " brc " " hlb " " blc " " vll " " inner " " small " "} -help\ - "Characters to use for image placeholders. The default list of empty spaces is recommended - for ease of visualisation and output parsing assuming an function such as punk::ansi::ansistrip - is available. An alternative set of characters such as a unicode block character might be useful - alongside turning ANSI off by setting -image_place_ansi to empty; if the unicode character is known - to be outside the domain of characters in the source document, and it is desired to use such chars - to detect approximate image position in relation to text. + This means that just by stripping ANSI from the output, layout is maintained, + and the text can be processed simply, or the ANSI can be retained if your text + parser wishes to use it to make decisions. + The colour at element 'border' is for the border colour, 'inner' for the area + within the border, and the colour at element 'small' is for images too small for + border representation. The border bears no relation to the underlying image - and + is merely to aid in seeing when images overlap or are cropped etc. The dimensions + of the placeholder are approximates only to the source images, as they are in + multiples of the fixed-width output font character width and line height. + Setting -image_place_ansi to an empty value, or -image_place_chars to an empty + value will stop ANSI from being output at the image locations." + #------------------------------------------------------------------------------------ + -image_place_chars -type dict -defaultdisplaytype dict -default {tlc " " hlt " " trc " " vlr " " brc " " hlb " " blc " " vll " " inner " " small " "} -help\ + "Characters to use for image placeholders. The default list of empty spaces is + recommended for ease of visualisation and output parsing assuming an function + such as punk::ansi::ansistrip is available. An alternative set of characters + such as a unicode block character might be useful alongside turning ANSI off by + setting -image_place_ansi to empty; if the unicode character is known to be + outside the domain of characters in the source document, and it is desired to + use such chars to detect approximate image position in relation to text. tlc - border top left corner hlt - border top horizontal lines trc - border top right corner @@ -157,40 +213,51 @@ namespace eval ::punk::pdf { inner - image area within border small - for images too small for borders " + -header -type string -default "" -help\ + "Header line to record details of the conversion run. + Set to 'default' to use the provided default template. + Substitutions available: + %doc% - full path of source pdf + %pages% - number of pages in the pdf + %size% - size in bytes of the input pdf file + %sha1% - sha1 checksum of the input pdf file + %parserversion% - version of punk::pdf used. + %args% - arguments supplied to text command. + %default% - the default header template - for extending." -sep_page -type string -default "" -help\ "Send a line of output to stdout at the beginning of each - page in the PDF. - If set to the value: default - a default separator with some added info will be used. - line begins with: #pdf::text:pagestart - Substitutions available: - %index% 0-based index of the page (not the page number as in the PDF) - %pageblocks% total number of blocks in the page, includes both - text and image blocks. - %textblocks% number of text blocks in the page - %imageblocks% number of image blocks in the page - %nl% newline - %default% The default page sep - to allow extending." + page in the PDF. + If set to the value: default + a default separator with some added info will be used. + line begins with: #pdf::text:pagestart + Substitutions available: + %index% 0-based index of the page (not the page number as in the PDF) + %pageblocks% total number of blocks in the page, includes both + text and image blocks. + %textblocks% number of text blocks in the page + %imageblocks% number of image blocks in the page + %nl% newline + %default% The default page sep - to allow extending." -sep_block -type string -default "" -help\ "Send a line of output to stdout at the beginning of each - textblock in the PDF. This can aid in parsing. - If set to the the value: default - a default separator with some added info will be used. - line begins with: #pdf::text:blockstart - Substitutions available: - %type% type of block text or image - %pageindex% 0-based index of the page (not the page number as in the PDF) - %y-index% 0-based index of block after the textblock list has - been sorted by y0 - %x0% first x coordinate of the block - %y0% first y coordinate of the block - %x1% second x coordinate of the block - %y1% second y coordinate of the block - %nl% newline - %warnings% Number of warnings emitted to stderr for the block. - These are normally for text overlay attempts, which should - only affect whitespace, but should be checked. - %default% The default blocksep - to allow extending." + textblock in the PDF. This can aid in parsing. + If set to the the value: default + a default separator with some added info will be used. + line begins with: #pdf::text:blockstart + Substitutions available: + %type% type of block text or image + %pageindex% 0-based index of the page (not the page number as in the PDF) + %y-index% 0-based index of block after the textblock list has + been sorted by y0 + %x0% first x coordinate of the block + %y0% first y coordinate of the block + %x1% second x coordinate of the block + %y1% second y coordinate of the block + %nl% newline + %warnings% Number of warnings emitted to stderr for the block. + These are normally for text overlay attempts, which should + only affect whitespace, but should be checked. + %default% The default blocksep - to allow extending." -outchannel -type string -default stdout -choicerestricted 0 -choices {return null stderr stdout}\ -choicelabels { return\ @@ -215,33 +282,33 @@ namespace eval ::punk::pdf { any open Tcl channel can be used." -debug_pageblock -default "" -type list -minsize 2 -maxsize 2 -help\ "A 2-element list of the page then block index(s) for which to output - extra information on stderr. - Supplied value must contain 2 elements {indexset indexset} - The membership selection is performed the same way as -page_indexes. - e.g {.. end-1} means second-last block on every page." + extra information on stderr. + Supplied value if not empty, must contain 2 elements {indexset indexset} + The membership selection is performed the same way as -page_indexes. + e.g {.. end-1} means second-last block on every page." -debug_highlight -default "red bold" -help\ "ANSI codes as understood by punk::ansi (see punk::ansi::a?) - These are used to colourise output if -debug_pageblock has been set" + These are used to colourise output if -debug_pageblock has been set" -highlight_overwrites -default "bold" -help\ "Ansi codes as understood by punk::ansi (see punk::ansi::a?) - These are used to mark text elements which have been overlayed with - the same text data. This may occur for example in table rows in the source - document that have alternating colour or emphasis. - Having this ANSI applied can be useful when parsing the output to aid in - grouping lines. e.g by using [punk::ansi::ta::detect ] or similar. - - By applying ansi, the layout is not disturbed when viewing in a terminal, - and may be more visually similar to the source document. - Saved output can always be passed through an ansistrip function if needed, - or the default -highlight_overwrites can be overridden by supplying an empty - string." - + These are used to mark text elements which have been overlayed with + the same text data. This may occur for example in table rows in the source + document that have alternating colour or emphasis. + Having this ANSI applied can be useful when parsing the output to aid in + grouping lines. e.g by using [punk::ansi::ta::detect ] or similar. + + By applying ansi, the layout is not disturbed when viewing in a terminal, + and may be more visually similar to the source document. + Saved output can always be passed through an ansistrip function if needed, + or the default -highlight_overwrites can be overridden by supplying an empty + string." + -engine -type any -default MuPDF -choices {MuPDF} -help\ "Only MuPDF is currently implemented. Be aware that while this script and tclMuPDF - are MIT/BSD licensed, the underlying MuPDF library is AGPL - which could place - some restrictions on commercial use." + are MIT/BSD licensed, the underlying MuPDF library is AGPL - which could place + some restrictions on commercial use." -warnings_engine -default 0 -choices {0 1 2}\ -choicelabels { 0\ @@ -253,7 +320,8 @@ namespace eval ::punk::pdf { }\ -help\ "Whether to display internal warnings regarding the state of the PDF document. - The underlying MuPDF library can in many cases work with bad/corrupted PDFS." + The underlying MuPDF library can in many cases work with bad/corrupted PDFS, + and if so, may emit warnings." -warnings_textblock -default 2 -type integer -range {0 9} -help\ "0 to disable. 1 for least info, 9 for most. @@ -263,12 +331,20 @@ namespace eval ::punk::pdf { "Path to the .pdf file to parse" } proc text {args} { + variable callid incr callid + + package require tcl::chan::variable + #variable testchan + #set testchan [::tcl::chan::variable ::punk::pdf::test($callid)] + #chan configure $testchan -translation lf + variable results variable base_x_per_c variable cfg_blocksep_default variable cfg_pagesep_default + variable cfg_header_default set x_per_line [expr {$base_x_per_c * 2}] ;#hack for now set argd [punk::args::parse $args withid ::punk::pdf::text] @@ -282,6 +358,10 @@ namespace eval ::punk::pdf { if {$opt_blocksep eq "default"} { set opt_blocksep $cfg_blocksep_default } + set opt_header [dict get $opts -header] + if {$opt_header eq "default"} { + set opt_header $cfg_header_default + } set debug_pageblock [dict get $opts -debug_pageblock] set debug_highlight [string trim [dict get $opts -debug_highlight]] set highlight_overwrites [string trim [dict get $opts -highlight_overwrites]] @@ -289,8 +369,14 @@ namespace eval ::punk::pdf { set block_indexes [string trim [dict get $opts -block_indexes]] set warnings_engine [dict get $opts -warnings_engine] set opt_compact [dict get $opts -compact] + set opt_postcompact [dict get $opts -postcompact] + if {$opt_postcompact} { + set opt_compact 1 ;#don't emit inter-block spacing in the first place + #will also strip vertical space merged blocks + } set opt_image_place_ansi [dict get $opts -image_place_ansi] set opt_image_place_chars [dict get $opts -image_place_chars] + set opt_shrink_textfree_blocks [dict get $opts -shrink_textfree_blocks] if {$warnings_engine == 2} { #turn on printwarnings before we start ::mupdf::printwarnings 1 @@ -302,6 +388,8 @@ namespace eval ::punk::pdf { return { package require tcl::chan::variable set outchan [::tcl::chan::variable ::punk::pdf::results($callid)] + #chan configure $outchan -translation binary -encoding utf-8 + chan configure $outchan -translation lf -encoding utf-8 } null { package require tcl::chan::null @@ -320,7 +408,8 @@ namespace eval ::punk::pdf { } else { set errchan [::tcl::chan::variable ::punk::pdf::results($callid)] } - chan configure $errchan -buffering none + #chan configure $errchan -buffering none -translation binary + chan configure $errchan -buffering none -translation lf -encoding utf-8 } null { package require tcl::chan::null @@ -347,6 +436,13 @@ namespace eval ::punk::pdf { set d [::mupdf::open $fname] set npages [$d npages] + if {$opt_header ne ""} { + set opt_header [string map [list %default% $cfg_header_default] $opt_header] + package require sha1 + set hash [sha1::sha1 -hex -file $fname] + set map [list %nl% \n %crlf% \r\n %doc% $fname %pages% $npages %size% [file size $fname] %sha1% $hash %args% $args %parserversion% [package present punk::pdf]] + puts $outchan [string map $map $opt_header] + } set page_index_list [punk::lib::indexset_resolve $npages $page_indexes] set debug_pageblock_pages [punk::lib::indexset_resolve $npages [lindex $debug_pageblock 0]] @@ -495,21 +591,33 @@ namespace eval ::punk::pdf { # } #} #------------ - set cur_merge_base_fields [dict get [lindex $MERGE_BUFFER 0 0] header fields] - if {$bny0 >= [dict get $cur_merge_base_fields y1]} { - #this block begins below the entire MERGE_BUFFER - time to render it - _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $warnings_textblock $outchan $errchan - if {!$opt_compact} { - #default: -compact false - set vdist [expr {$bny0 - [dict get $cur_merge_base_fields y1]}] - set vdistlines [expr {int(ceil($vdist / $x_per_line))}] - if {$vdistlines > 0} { - set vseparator [string repeat \n $vdistlines] - puts -nonewline $outchan $vseparator + #puts "MB dimensions: [punk::pdf::system::merge_buffer_dimensions MERGE_BUFFER]" + set mbdimensions [punk::pdf::system::merge_buffer_dimensions MERGE_BUFFER] ;#list x0 y0 x1 y1 + #set cur_merge_base_fields [dict get [lindex $MERGE_BUFFER 0 0] header fields] + #if {$bny0 >= [dict get $cur_merge_base_fields y1]} {} + if {$bny0 >= [lindex $mbdimensions 3]} { + + #this block begins below the entire MERGE_BUFFER - time to render previous block + _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $opt_shrink_textfree_blocks $opt_postcompact $warnings_textblock $outchan $errchan + + if {$tbnum in $block_index_list} { + #now this block's offset from the top + if {!$opt_compact} { + #default: -compact false + #set vdist [expr {$bny0 - [dict get $cur_merge_base_fields y1]}] + set vdist [expr {$bny0 - [lindex $mbdimensions 3]}] + #set vdistlines [expr {int(ceil($vdist / $x_per_line))}] + set vdistlines [expr {int(floor($vdist / $x_per_line))}] + if {$vdistlines > 0} { + set vseparator [string repeat \n $vdistlines] + puts -nonewline $outchan $vseparator + } } } } else { - lappend MERGE_BUFFER [list] + if {$tbnum in $block_index_list} { + lappend MERGE_BUFFER [list] + } } } @@ -577,9 +685,17 @@ namespace eval ::punk::pdf { set vertx [expr {$bny1 - $bny0}] set img_char_height [expr {int(ceil($vertx / $x_per_line))}] ;#linecount - #puts "-- imgstart_char_pos:$imgstart_char_pos img_char_width:$img_char_width" + #puts "-- img_char_height:$img_char_height img_char:width: $img_char_width" + if {$img_char_height == 0} { + puts $errchan "WARNING type debug msg {zero height image}" ;#todo + incr block_warning_count ;#0 based + #nothing to render for a zero height img + #MERGE_BUFFER will have header + lset MERGE_BUFFER end end+1 [list warnings $block_warning_count] + continue + } - #put a border around the image if it's big enough + #put a border around the image if it's big enough #if it can't be bordered use a different colour #this gives us 3 greys applied to space elements, to aid in seeing image overlaps and positioning. #all can be easily stripped out with ansistrip to leave only spaces for text processing @@ -629,14 +745,24 @@ namespace eval ::punk::pdf { set a_inner [dict get $opt_image_place_ansi inner] set b [textblock::frame -type [list tlc $c_tlc hlt $c_hlt trc $c_trc vlr $c_vlr brc $c_brc hlb $c_hlb blc $c_blc vll $c_vll] -ansiborder [a $a_border] [punk::ansi::ansiwrap $a_inner $inner]] } - } + } #set b [textblock::block $img_char_width $img_char_height " "] #set b [punk::ansi::ansiwrap Term-102 $b] set img_indent [textblock::block [expr {max(0,$imgstart_char_pos)}] $img_char_height \uFFFD] if {$imgstart_char_pos < 0} { #img representation is effectively cropped at left #img_indent will just be a list of newlines to match height + + #even if img_indent and b have no ansi - result of renderspace will have ansi resets (todo review overtype::renderspace) + set had_ansi 0 + if {[punk::ansi::ta::detect $b]} { + set had_ansi 1 + } set positioned_block [overtype::renderspace -insert_mode 0 -expand_right 1 -startcolumn $imgstart_char_pos $img_indent $b] + if {!$had_ansi} { + #if user disabled ansi on image_placeholders - we need to ensure it still has none after renderspace + set positioned_block [punk::ansi::ansistrip $positioned_block] + } } else { set positioned_block [textblock::join -- $img_indent $b] } @@ -663,7 +789,7 @@ namespace eval ::punk::pdf { #(common to encounter line data where only some text elements are within the block we are currently at - e.g tables) #todo - review treatment of #assume if it just extends out of the blocks x range - it's part of a larger block (?) review - xrangefudge? - set has_element_in_range 0 + set has_element_in_range 0 foreach xxt $ydata { lassign $xxt dx0 dx1 set xrangefudge 0 ;#shouldn't be needed? @@ -682,7 +808,7 @@ namespace eval ::punk::pdf { lassign $debug_pageblock dp db if {$pageindex in $debug_pageblock_pages && $tbnum in $debug_pageblock_blocks} { - set debuginfo "b:$tbnum yinfo: $yinfo ydata: $ydata" + set debuginfo "b:$tbnum yinfo: $yinfo ydata: [punk::ansi::ansistring VIEW -lf 1 $ydata]" if {$debug_highlight ne ""} { set debuginfo "[punk::ansi::a {*}$debug_highlight]$debuginfo\x1b\[m" } @@ -717,23 +843,44 @@ namespace eval ::punk::pdf { set existing_y0s [lsearch -all -inline -index 0 -subindices $vfudged_lines *] set existing_y1s [lsearch -all -inline -index 1 -subindices $vfudged_lines *] set vfudged_line_added 0 + #set vfidx -1 + #foreach ey0 $existing_y0s { + # incr vfidx + # set vdiff [expr {abs($line_y0 - $ey0)}] + # #x units per char is normally narrower than height + # set linefudge 1.5 ;#choose linefudge less than assumed x_per_line of 2*base_x_per_c - or small font lines will not be separated #REVIEW + # if {$vdiff < ($linefudge * $base_x_per_c)} { + # #normalize to existing value + # #line_y1 unnormalised?? + # #if y1 vdiff is about the same - normalise it too + # set ey1 [lindex $existing_y1s $vfidx] + # set vdiff2 [expr {abs($line_y1 -$ey1)}] + # if {abs($vdiff2 - $vdiff) <= 8} { + # set line_y1 $ey1 + # } + # set vfudged_line_added 1 + # lappend vfudged_lines [list $ey0 $line_y1 [lindex $fl 2]] + # break + # } + #} + + #when mixing fonts in same line - aligning the bases is more common. set vfidx -1 - foreach ey0 $existing_y0s { + foreach ey1 $existing_y1s { incr vfidx - set vdiff [expr {abs($line_y0 - $ey0)}] + set vdiff [expr {abs($line_y1 - $ey1)}] #x units per char is normally narrower than height set linefudge 1.5 ;#choose linefudge less than assumed x_per_line of 2*base_x_per_c - or small font lines will not be separated #REVIEW if {$vdiff < ($linefudge * $base_x_per_c)} { #normalize to existing value - #line_y1 unnormalised?? - #if y1 vdiff is about the same - normalise it too - set ey1 [lindex $existing_y1s $vfidx] - set vdiff2 [expr {abs($line_y1 -$ey1)}] + #if y0 vdiff is about the same - normalise it too + set ey0 [lindex $existing_y0s $vfidx] + set vdiff2 [expr {abs($line_y0 -$ey0)}] if {abs($vdiff2 - $vdiff) <= 8} { - set line_y1 $ey1 + set line_y0 $ey0 } set vfudged_line_added 1 - lappend vfudged_lines [list $ey0 $line_y1 [lindex $fl 2]] + lappend vfudged_lines [list $line_y0 $ey1 [lindex $fl 2]] break } } @@ -774,7 +921,7 @@ namespace eval ::punk::pdf { incr block_linenum #puts "fline: $fl" lassign $fl fy0 fy1 spans_by_x0 ;#don't pull out spans_by_x0_adjusted - it's adjusted within this loop by referring to its index in working_lines - + set line_chunks [list] #calculating x units per char for each string is only really possible if we know the exact font and size set xidx -1 ;#index of spans in line, which are sorted by x0 (lhs) @@ -783,6 +930,8 @@ namespace eval ::punk::pdf { incr xidx lassign $xxt tx0 tx1 text + #puts -nonewline $testchan $text + #any span, including first may have rhs alignments with spans from previous lines in the same block #look at original rhs alignment of spans in previous lines that match this one's x1 @@ -850,6 +999,7 @@ namespace eval ::punk::pdf { #1st shot - use rhs of element immediately to the left (either existing TESTBLOCK or earliear spans on same line) if {$xidx == 0} { + set prevspan_x0 "" set prevspan_x1 "" set prevspan_x1_adjusted "" if {$TESTBLOCK ne ""} { @@ -888,6 +1038,7 @@ namespace eval ::punk::pdf { } else { #set prevspan_x1_adjusted [lindex $spans_by_x0_adjusted $xidx-1 1] + set prevspan_x0 [lindex $spans_by_x0 $xidx-1 0] set prevspan_x1_adjusted [lindex $working_lines $block_linenum 3 $xidx-1 1] set prevspan_x1 [lindex $spans_by_x0 $xidx-1 1] #set effective_prev_x1 [expr {max($prevspan_x1,$prevspan_x1_adjusted)}] @@ -983,7 +1134,7 @@ namespace eval ::punk::pdf { if {$pageindex in $debug_pageblock_pages && $tbnum in $debug_pageblock_blocks} { set debuginfo "span x0 xx1 text: [lindex $xxt 0] [lindex $xxt 1] [punk::ansi::a normal]'[lindex $xxt 2]'\x1b\[m" if {$debug_highlight ne ""} { - set debuginfo "[punk::ansi::a underline {*}$debug_highlight]$debuginfo\x1b\[m" + set debuginfo "[punk::ansi::a overline {*}$debug_highlight]$debuginfo\x1b\[m" } puts $errchan $debuginfo if {$prevspan_x1 ne $prevspan_x1_adjusted} { @@ -992,9 +1143,9 @@ namespace eval ::punk::pdf { } else { set C "" ; set RST "" } - set debuginfo "prevspan_x1:$prevspan_x1 prevspan_x1_adjusted: $C$prevspan_x1_adjusted$RST effective_prev_x1: $effective_prev_x1" + set debuginfo "prev x0 x1 : $prevspan_x0 $prevspan_x1 prevspan_x1_adjusted: $C$prevspan_x1_adjusted$RST effective_prev_x1: $effective_prev_x1" if {$debug_highlight ne ""} { - set debuginfo "[punk::ansi::a overline {*}$debug_highlight]$debuginfo\x1b\[m" + set debuginfo "[punk::ansi::ansiwrap underline {*}$debug_highlight $debuginfo]" } puts $errchan $debuginfo } @@ -1014,7 +1165,9 @@ namespace eval ::punk::pdf { #set buffer [string repeat " " $start_char_pos] #set x_gap_chars 0 } else { - set x_gap [expr {$tx0 - $prevspan_x1}] ;#should be positive (but could it be negative, ie overlapped?) + set x_gap [expr {$tx0 - $prevspan_x1}] ;#usually positive but can easily be negative. e.g the word TAX in some fonts can have the left of the A slightly under the top bar of the T + #if the font is large - this negative offset could be more than an entire width of our output character size. + #we really need to know the source font to do this even close to properly - REVIEW set x_gap_chars [x_to_c $x_gap] ;#used to expand buffer below during collision detection. #set buffer [string repeat "^" $x_gap_chars] } @@ -1022,14 +1175,19 @@ namespace eval ::punk::pdf { #if {$xidx > 0} { # set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1] + $x_gap_chars}]] #} - - set text_points [c_to_x [string length $text]] - set tend [expr {$tx0 + $text_points}] - set remspace [expr {$tx1 - $tend}] - #set tailbuffer [string repeat "-" [x_to_c $remspace]] - set tailbuffer "" - - if {$xidx > 0 && $effective_prev_x1 ne "" && $tx0 <= $effective_prev_x1} { + if {$xidx > 0 && $x_gap >= -3 && $x_gap < 0} { + #treat small negative gap (overlap) in source (prev_x1 > tx0) (typically for character overhang/underhang) as adjacency + dict set page_seen_chunks $text 1 + set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] + lappend line_chunks "$buffer$text" + } elseif {$xidx > 0 && $x_gap >= 0 && $x_gap < ($base_x_per_c * 0.75)} { + #treat no, or small positive gap in source (prev_x1 < tx0) as intended adjacency + #.75 is yet another fudge for lack of font info - todo - rework to another engine? + #this branch could be merged with above, but kept separate for clarity and for specific comments. + dict set page_seen_chunks $text 1 + set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] + lappend line_chunks "$buffer$text" + } elseif {$xidx > 0 && $effective_prev_x1 ne "" && $tx0 <= $effective_prev_x1} { #possible overlap/overlay - left x of this data is before right x of previous #first use output font's x_per_c and see if only overlaying whitespace @@ -1075,6 +1233,8 @@ namespace eval ::punk::pdf { #set xcursor_prev [string length [punk::ansi::ansistrip [join [lrange $line_chunks 0 end-1] ""]]] #set buffer_prev [string repeat " " [expr {$startpos - $xcursor_prev}]] #lset line_chunks end "$buffer_prev[punk::ansi::a {*}$highlight_overwrites]$text[punk::ansi::a]" + + lset line_chunks end "$buffer[punk::ansi::a {*}$highlight_overwrites]$text[punk::ansi::a]" } #else: no need to overwrite same text if no ansi @@ -1110,7 +1270,7 @@ namespace eval ::punk::pdf { set expand [string repeat " " $expandchars] append buffer $expand # REVIEW - seems to work reasonably (e.g in FTP-1.1.1.pdf mentioned) - but we'll still output a warning - lappend line_chunks $buffer$text$tailbuffer + lappend line_chunks $buffer$text if {$char_overlap > 0 && $warnings_textblock >= 4} { puts $errchan "Warning pageindex [format %4s $pageindex] blockindex [format %4s $tbnum] blockwarning [format %4s $block_warning_count] type {text overlay}" @@ -1137,7 +1297,7 @@ namespace eval ::punk::pdf { if {$xidx > 0 && $x_gap_chars == 0} { set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] } - lappend line_chunks "$buffer$text$tailbuffer" + lappend line_chunks "$buffer$text" } } else { dict set page_seen_chunks $text 1 @@ -1147,7 +1307,7 @@ namespace eval ::punk::pdf { set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] } } - lappend line_chunks "$buffer$text$tailbuffer" + lappend line_chunks "$buffer$text" } #position 3 is spans_by_x0_adjusted set chunk [lindex $line_chunks end] @@ -1208,10 +1368,10 @@ namespace eval ::punk::pdf { puts $outchan "____________________________$bx1,$by1" } } - _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $warnings_textblock $outchan $errchan if {[llength $MERGE_BUFFER]} { - puts $errchan "WARNING: MERGE_BUFFER not empty, but should be empty at end of page" - puts $outchan [punk::lib::showdict -roottype list $MERGE_BUFFER] + _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $opt_shrink_textfree_blocks $opt_postcompact $warnings_textblock $outchan $errchan + #puts $errchan "WARNING: MERGE_BUFFER not empty, but should be empty at end of page" + #puts $outchan [punk::lib::showdict -roottype list $MERGE_BUFFER] } } @@ -1219,12 +1379,15 @@ namespace eval ::punk::pdf { puts $errchan "MuPDF wasrepaired: [$d wasrepaired]" puts $errchan "MuPDF warnings : [$d warnings]" } + flush $outchan + flush $errchan if {$opt_outchan eq "return" || $opt_errchan eq "return"} { #if both are 'return' at the same time - the same channel is used for both - flush $outchan - flush $errchan - set result [set results($callid)] + set result [encoding convertfrom utf-8 [set results($callid)]] + #set result [set results($callid)] + return $result } + return } } @@ -1264,6 +1427,43 @@ tcl::namespace::eval punk::pdf::lib { return $result } + #subset of possible boms - just for experimenting + proc bom_ident {binstr} { + if {[string match \uFFEF* $binstr]} { + return bom + } + set first32 [string range $binstr 0 3] + set bytes [binary scan $binstr H2H2H2H2 a b c d] + switch -- $a { + fe { + if {"$b" eq "ff"} { + return utf-16be + } + } + ef { + if {"$b$c" eq "bbbf"} { + return utf-8 + } + } + ff { + if {$b eq "fe"} { + if {"$c$d" eq "0000"} { + return utf-32le + } else { + return utf-16le + } + } + } + default { + if {"$a$b$c$d" eq "0000feff"} { + return utf-32be + } + } + } + return unknown + } + + #points to number of chars - in terms of fixed-width output font proc x_to_c {x} { upvar ::punk::pdf::base_x_per_c base_x_per_c @@ -1277,20 +1477,50 @@ tcl::namespace::eval punk::pdf::lib { return [expr {$charcount * $base_x_per_c}] } + proc merge_buffer_dimensions {bufname} { + upvar $bufname MB + if {![llength $MB]} { + return [list 0 0 0 0] ;#x0 y0 x1 y1 + } + set firstheader [lindex $MB 0 0] + set flds [dict get [lindex $firstheader 1] fields] + set dimensions [list [dict get $flds x0] [dict get $flds y0] [dict get $flds x1] [dict get $flds y1]] + #we can get negative values e.g image offset to left or top + foreach sametopblocks $MB { + foreach B $sametopblocks { + if {[lindex $B 0] eq "header"} { + set flds [dict get [lindex $B 1] fields] + set this_x0 [dict get $flds x0] + set this_y0 [dict get $flds y0] + set this_x1 [dict get $flds x1] + set this_y1 [dict get $flds y1] + lset dimensions 0 [expr {min([lindex $dimensions 0],$this_x0)}] + lset dimensions 1 [expr {min([lindex $dimensions 1],$this_y0)}] + lset dimensions 2 [expr {max([lindex $dimensions 2],$this_x1)}] + lset dimensions 3 [expr {max([lindex $dimensions 3],$this_y1)}] + } + } + } + return $dimensions + } #whether merging on or off - we keep a MERGE_BUFFER #At every y-index change, and at end of page, process last entry(s) - proc _process_merge_buffer {bufname pageindex blocksep_default opt_blocksep domerge opt_warnings_textblock outc errc} { + proc _process_merge_buffer {bufname pageindex blocksep_default opt_blocksep domerge opt_shrink_textfree_blocks opt_postcompact opt_warnings_textblock outc errc} { upvar ::punk::pdf::base_x_per_c base_x_per_c + upvar ::punk::pdf::testchan testchan set x_per_line [expr {$base_x_per_c * 2}] ;#hack for now upvar $bufname MB if {!$domerge} { set block_warnings 0 + #puts "MB:'$MB'" set warningrecords [lsearch -all -inline -index 0 $MB warnings] foreach wr $warningrecords { incr block_warnings [lindex $wr 1] } #just emit the header & line blocks we have in the merge buffer one after the other + set blockresult "" ;#needed for postcompact + set emissions [list] ;#need to enable blank sep_block even when -postcompact is true foreach yset $MB { foreach B $yset { switch -- [lindex $B 0] { @@ -1298,22 +1528,72 @@ tcl::namespace::eval punk::pdf::lib { if {$opt_blocksep ne ""} { set sep [dict get [lindex $B 1] sep] set sep [string map [list $block_warnings] $sep] - puts $outc $sep + #puts $outc $sep + lappend emissions [list data $blockresult] + lappend emissions [list sep $sep] + set blockresult "" + #append blockresult $sep \n } } lines { set lines [lindex $B 1] + #puts $errc "--->[punk::ansi::ansistring VIEW $lines]" + set idx -1 foreach l $lines { - puts $outc $l + incr idx + if {[catch { + #puts $outc $l + append blockresult $l \n + } errMsg]} { + set prevline [lindex $lines $idx-1] + set nextline [lindex $lines $idx+1] + puts stderr "error writing output channel $outc\nerrMsg:$errMsg\n" + puts stderr " prevline:'[punk::ansi::ansistring VIEW -lf 1 $prevline]'" + puts stderr " lineview '[punk::ansi::ansistring VIEW -lf 1 $l]'" + puts stderr " nextline:'[punk::ansi::ansistring VIEW -lf 1 $nextline]'" + #puts $testchan "=----------------------------------------=" + #puts $testchan $l + #puts $testchan "=----------------------------------------=" + } } } block { - puts $outc [string map [list \uFFFD " "] [lindex $B 1]] + if {!$opt_shrink_textfree_blocks} { + #puts $outc [string map [list \uFFFD " "] [lindex $B 1]] + append blockresult [string map [list \uFFFD " "] [lindex $B 1]] \n + #puts $errc "--->[punk::ansi::ansistring VIEW [lindex $B 1]]" + } } warnings {} } } } + lappend emissions [list data $blockresult] + + set output "" + if {$opt_postcompact} { + foreach e $emissions { + lassign $e etype data_or_header + switch -- $etype { + sep { + append output $data_or_header \n + } + data { + foreach ln [split $data_or_header \n] { + if {[string trim $ln] ne ""} { + append output $ln \n + } + } + } + } + } + puts -nonewline $outc $output + } else { + foreach e $emissions { + lassign $e _ data_or_header + puts -nonewline $outc $data_or_header\n + } + } set MB [list] return } @@ -1322,38 +1602,71 @@ tcl::namespace::eval punk::pdf::lib { set headers_structure [list] set mergedblock "" set warnings [list] - #puts "MB: $MB" if {[llength $MB]} { - set rootheader [lindex $MB 0 0 1] - #puts $errc "rootheader: $rootheader" - set base_y [dict get $rootheader fields y0] + #rootheader doesn't 'contain' subsequent blocks in x or y direction - it's just the y0 that triggers the new merge because it's below previous blocks + #set rootheader [lindex $MB 0 0 1] + ##puts $errc "rootheader: $rootheader" + #set base_y [dict get $rootheader fields y0] + set mbdimensions [merge_buffer_dimensions MB] ;# x0 y0 x1 y1 + set base_y [lindex $mbdimensions 1] + } + #loop through and determine which blocks of those to be merged are RHS e.g largest x1 + set max_x1 0 + #todo - further refine to find blocks in the mergeset with no textblocks to right? + set rhs_block_indexes [list] + set textblock_x0_list [list] + foreach sametopblocks $MB { + foreach B $sametopblocks { + if {[lindex $B 0] eq "header"} { + set flds [dict get [lindex $B 1] fields] + #puts "flds: $flds" + if {[dict get $flds type] eq "text"} { + set this_x1 [dict get $flds x1] + lappend textblock_x0_list [dict get $flds x0] + if {abs($this_x1 - $max_x1) < 1} { + lappend rhs_block_indexes [dict get $flds blockindex] + set max_x1 [expr {max($this_x1,$max_x1)}] + } elseif {$this_x1 > $max_x1} { + set max_x1 $this_x1 + set rhs_block_indexes [list [dict get $flds blockindex]] + } + } + } + } } - foreach ybuf $MB { + foreach sametop $MB { set yheaders [list] set y_index "" - if {[llength $ybuf]} { - set firstheader [lindex $ybuf 0 1] - set blockrow_y [dict get $firstheader fields y0] - set blockrow_offset [expr {$blockrow_y - $base_y}] - set blockrow_x0 [dict get $firstheader fields x0] - set blockrow_x1 [dict get $firstheader fields x1] + if {[llength $sametop]} { + set firstheader [lindex $sametop 0 1] + set blockrow_y0 [dict get $firstheader fields y0] + set blockrow_y1 [dict get $firstheader fields y1] + set blockrow_top_offset [expr {$blockrow_y0 - $base_y}] + #set blockrow_x0 [dict get $firstheader fields x0] + #set blockrow_x1 [dict get $firstheader fields x1] if {$opt_warnings_textblock >= 9} { puts $errc "Warning type debug operation {merge blockrow} firstheader $firstheader" } } - set leftright_idx -1 - foreach B $ybuf { - incr leftright_idx - #if {$leftright_idx == 0} { - # set firstheader [lindex $B 1] - #} - #while {[llength $ybuf]} {} - #set B [lpop ybuf 0] + set header_idx -1 + #set leftright_max [expr {[llength $sametop] -1}] + set hdrs [lsearch -all -inline -index 0 $sametop header] + #puts "--- > $hdrs" + set header_max [expr {[llength $hdrs] -1}] + foreach B $sametop { #puts "====>$B" switch -- [lindex $B 0] { header { - set this_y_index [dict get [lindex $B 1] fields y-index] + incr header_idx ;#not necessarily in left-right order! + set this_y_index [dict get [lindex $B 1] fields y-index] + set this_x0 [dict get [lindex $B 1] fields x0] + set this_x1 [dict get [lindex $B 1] fields x1] + set this_y0 [dict get [lindex $B 1] fields y0] + set this_y1 [dict get [lindex $B 1] fields y1] + set this_top_offset [expr {$this_y0 - $base_y}] + set this_bottom_offset [expr {$this_y1 - $base_y}] + set this_block_index [dict get [lindex $B 1] fields blockindex] if {$y_index eq ""} { set y_index $this_y_index } else { @@ -1372,10 +1685,11 @@ tcl::namespace::eval punk::pdf::lib { set lines [lindex $B 1] #review - vertical oriented text at left can stomp on data during merge #we are only given the bbox info by tclMuPDF - if {abs($blockrow_x1 - $blockrow_x0) < 1} { + if {abs($this_x1 - $this_x0) < 1} { #non displayable #todo - emit at document tail as a non-layed out block? if {$opt_warnings_textblock > 0} { + #review - firstheader vs this_block_index? #puts $errc "Warning - no x space. Undisplayed text on page:[dict get $firstheader fields pageindex] block:[dict get $firstheader fields blockindex] text:[join $lines \n]" puts $errc "Warning pageindex [dict get $firstheader fields pageindex] blockindex [dict get $firstheader fields blockindex] type {no x space} msg {Undisplayed text} text \"[join $lines \n]\"" } @@ -1386,38 +1700,95 @@ tcl::namespace::eval punk::pdf::lib { #e.g overtype::textblock::ellipsis $space_available $block set block [join $lines \n] - set bwidth [textblock::widthtopline $block] ;#assume non-ragged block + set numlines [llength $lines] + set bwidth [textblock::width $block] ;#assume ragged block (differing line lengths) set margintop "" - if {$blockrow_offset > 0} { - set offsetlines [expr {int(ceil($blockrow_offset / $x_per_line))}] - set margintop [textblock::block $bwidth $offsetlines " "] + set bottom_offsetlines [expr {int(round($this_bottom_offset / $x_per_line))}] + if {[llength $MB] > 1 && $bottom_offsetlines >= $numlines} { + #can use bottom of textblock dist from top (better line alignment for scenario: "smallprint fieldname:" "bigprint value" aligned at base ???) + set topmarginlines [expr {$bottom_offsetlines - $numlines}] + #puts "====> offset from bottom bottom_offsetlines:$bottom_offsetlines topmarginlines:$topmarginlines this_y1:$this_y1" + if {$topmarginlines > 0} { + set margintop [textblock::block $bwidth $topmarginlines " "] + } + } else { + if {$this_top_offset > 0} { + #puts "====> offset from top" + #set offsetlines [expr {int(ceil($blockrow_top_offset / $x_per_line))}] + set offsetlines [expr {int(round($blockrow_top_offset / $x_per_line))}] ;# + #set offsetlines [expr {int(floor($this_top_offset / $x_per_line))}] ;# + if {$offsetlines > 0} { + set margintop [textblock::block $bwidth $offsetlines " "] + } + } + } + if {$this_block_index ni $rhs_block_indexes} { + #truncate overwidth blocks except for rhs + #overwidth if can collide (or merge) with x0 of any other textblocks in the merge set + #only interested in blocks where x0 >= this x1 + set next_x0_list [list] + foreach x0 $textblock_x0_list { + if {$x0 >= ($this_x1 - $base_x_per_c)} { + lappend next_x0_list $x0 + } + } + set next_x0_list [lsort -real $next_x0_list] + set closest_x0 [lindex $next_x0_list 0] + if {$closest_x0 ne ""} { + #set maxwidthchars [x_to_c $this_x1] + set maxwidthchars [x_to_c $closest_x0] + incr maxwidthchars -1 ;#vertically merging text blocks results in unintended text adjacencies - REVIEW + if {$maxwidthchars < $bwidth} { + #warn of rhs truncation + #text will have 'ellipsis' + #review - source text could already have ellipsis - this makes detecting truncation harder + #for now we will produce a warning if ellipsis is in the result and is on the right hand side + #todo - use a function other than textblock::frame which can report truncation + set block [textblock::frame -type {} -boxlimits {} -width [expr {$maxwidthchars +2}] $block] ;# +2 for vertical frame borders - which are removed but needed for block -width + set is_truncated 0 + foreach ln [split [punk::ansi::ansistrip $block] \n] { + if {[regexp {[\u2026]$} $ln]} { + set is_truncated 1 + break + } + } + if {$is_truncated} { + if {$opt_warnings_textblock > 0} { + puts $errc "Warning pageindex [dict get $firstheader fields pageindex] blockindex [dict get $firstheader fields blockindex] type {truncation} msg {merged textblock truncated on rhs} text \"[join $lines \n]\"" + } + lappend warnings 1 + } + } + } + } + #apply margintop after possible rhs truncation above (no need for ellipsis above content) + if {$margintop ne ""} { + set movedblock $margintop\n$block + } else { + set movedblock $block } + if {$mergedblock eq ""} { #could be image - as we are using overtype - should result in blank area #todo - consider option to set bg colour, then create a block of spaces of the right size - but don't add any chars other than ANSI codes which can be stripped easily. - if {$margintop ne ""} { - set mergedblock $margintop\n$block - } else { - set mergedblock $block - } + set mergedblock $movedblock } else { - if {$margintop ne ""} { - set movedblock $margintop\n$block - } else { - set movedblock $block - } set mergedblock [overtype::block -transparent 1 -overflow 1 $mergedblock $movedblock] } + } } block { set block [lindex $B 1] set bwidth [textblock::widthtopline $block] ;#assume non-ragged block set margintop "" - if {$blockrow_offset > 0} { - set offsetlines [expr {int(ceil($blockrow_offset / $x_per_line))}] - set margintop [textblock::block $bwidth $offsetlines "\uFFFD"] - #puts "=====> offset:$blockrow_offset" + if {$this_top_offset > 0} { + #set offsetlines [expr {int(ceil($this_top_offset / $x_per_line))}] + set offsetlines [expr {int(round($this_top_offset / $x_per_line))}] + if {$offsetlines > 0} { + set margintop [textblock::block $bwidth $offsetlines "\uFFFD"] + } + #puts "=====> offset:$this_top_offset" } if {$mergedblock eq ""} { if {$margintop ne ""} { @@ -1431,6 +1802,7 @@ tcl::namespace::eval punk::pdf::lib { } else { set movedblock $block } + #review - img placeholder can obscure text if above it - we don't have transparency info - so probably not what's wanted set mergedblock [overtype::block -transparent \uFFFD -overflow 1 $mergedblock $movedblock] } } @@ -1443,6 +1815,7 @@ tcl::namespace::eval punk::pdf::lib { lappend headers_structure $yheaders } if {[llength $all_headers] == 1} { + #puts stdout "---> [lindex $all_headers 0 1]" #only one block was in the MERGE_BUFFER set stored_filled_sep [dict get [lindex $all_headers 0] sep] if {$opt_blocksep ne ""} { @@ -1453,9 +1826,41 @@ tcl::namespace::eval punk::pdf::lib { } #all % placeholders filled, but isn't set sep [string map [list $block_warnings] $stored_filled_sep] + #still want to allow emitting empty line when opt_postcompact is true, if opt_blocksep is set to space puts $outc $sep } - puts $outc $mergedblock + #set is_text_free 1 ;#default to disprove + #set headerinfo [lindex $all_headers 0 1] + #set list_merges [dict get $headerinfo type] + #foreach blockrow_merge $list_merges { + # foreach tp $blockrow_merge { + # if {$tp eq "text"} { + # #todo - look for whitespace only and treat as text free + # set is_text_free 0 + # } + # } + #} + set blockresult "" + if {$opt_shrink_textfree_blocks} { + set teststripped [punk::ansi::ansistrip $mergedblock] + if {[string trim $teststripped] ne ""} { + set blockresult $mergedblock + } + } else { + set blockresult $mergedblock + } + set output "" + if {$opt_postcompact} { + foreach ln [split $blockresult \n] { + if {[string trim $ln] ne ""} { + append output $ln \n + } + } + puts -nonewline $outc $output + } else { + puts -nonewline $outc $blockresult\n + } + } elseif {[llength $all_headers] > 1} { #more than one block was in the MERGE_BUFFER set merged_minx0 Inf @@ -1489,7 +1894,27 @@ tcl::namespace::eval punk::pdf::lib { dict set map %marker% "[punk::ansi::a bold cyan]MERGEDBLOCK[punk::ansi::a]" puts $outc [string map $map $opt_blocksep] } - puts $outc $mergedblock + if {$opt_shrink_textfree_blocks} { + set teststripped [punk::ansi::ansistrip $mergedblock] + if {[string trim $teststripped] ne ""} { + #puts $outc $mergedblock + set blockresult $mergedblock + } + } else { + #puts $outc $mergedblock + set blockresult $mergedblock + } + set output "" + if {$opt_postcompact} { + foreach ln [split $blockresult \n] { + if {[string trim $ln] ne ""} { + append output $ln \n + } + } + puts -nonewline $outc $output + } else { + puts -nonewline $outc $blockresult\n + } } set MB [list] } diff --git a/src/project_layouts/custom/_project/punk.basic/src/make.tcl b/src/project_layouts/custom/_project/punk.basic/src/make.tcl index 8cb8153c..c3cbc9aa 100644 --- a/src/project_layouts/custom/_project/punk.basic/src/make.tcl +++ b/src/project_layouts/custom/_project/punk.basic/src/make.tcl @@ -3075,7 +3075,7 @@ foreach vfstail $vfs_tails { #copy the version that is mounted in this runtime to vfsname.new if {[catch { file copy -force $building_runtime $buildfolder/$vfsname.new - catch {exec chmod +x $buildfolder/$vfsname.new} + catch {exec chmod +w $buildfolder/$vfsname.new} } errM]} { puts stderr "$kit_type 'file copy -force $building_runtime $buildfolder/$vfsname.new' failed\n$errM" error $errM diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/flagfilter-0.3.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/flagfilter-0.3.tm index 1d37e215..00f58e82 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/flagfilter-0.3.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/flagfilter-0.3.tm @@ -1538,6 +1538,7 @@ namespace eval flagfilter { } } + #todo - rename 'cprocessor' is misleading oo::class create cprocessor { variable o_runid variable o_name @@ -1577,7 +1578,9 @@ namespace eval flagfilter { if {[dict exists $o_pinfo match]} { set o_matchspec [dict get $o_pinfo match] } else { - set o_matchspec {^[^-^/].*} ;#match anything that isn't flaglike + #review - unix paths? conflict with windows style flag such as /w + #must accept empty string + set o_matchspec {^[^-^/].*|^$} ;#match anything that isn't flaglike } set o_found_match 0 set o_matched_argument "" ;#need o_found_match to differentiate match of empty string diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk-0.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk-0.1.tm index 6fb185a9..83dad2bf 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk-0.1.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk-0.1.tm @@ -6947,7 +6947,8 @@ namespace eval punk { set newrow {} foreach oldrow $list_rows { if {$j >= [llength $oldrow]} { - continue + #continue + lappend newrow "" } else { lappend newrow [lindex $oldrow $j] } @@ -6956,6 +6957,19 @@ namespace eval punk { } return $res } + proc transpose_equal_lists {list_rows} { + set columns [list] + set rowidx -1 + foreach l $list_rows { + set colidx -1 + incr rowidx + foreach val $l { + incr colidx + lset columns $colidx $rowidx $val + } + } + return $columns + } proc transpose_strings {list_of_strings} { set charlists [lmap v $list_of_strings {split $v ""}] set tchars [transpose_lists $charlists] diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm index 255715ad..ad2d58f4 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm @@ -3785,7 +3785,7 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu } set codestack [list] - if {[punk::ansi::ta::detect $text]} { + if {[punk::ansi::ta::detectcode $text]} { set emit "" #set parts [punk::ansi::ta::split_codes $text] set parts [punk::ansi::ta::split_codes_single $text] @@ -3892,7 +3892,7 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu } set codestack [list] - if {[punk::ansi::ta::detect $text]} { + if {[punk::ansi::ta::detectcode $text]} { set emit "" #set parts [punk::ansi::ta::split_codes $text] set parts [punk::ansi::ta::split_codes_single $text] @@ -4158,7 +4158,8 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu # where line and column are ascii codes whose values are +31 # vt52 can be entered/exited via escapes # This means we probably need to to wrap enter/exit vt52 and keep this state - as we don't have a standard way to query for terminal type - # (vt52 supports ESC Z - but vt100 sometimes? doesn't - and querying at each output would be slow anyway, even if there was a common query :/ ) + # (vt52 supports ESC Z (obs DECID) - but vt100 sometimes? doesn't - and querying at each output would be slow anyway, even if there was a common query :/ ) + #ESC\[c - is more modern equiv of DECID lappend PUNKARGS [list { @id -id ::punk::ansi::vt52move @@ -4946,6 +4947,8 @@ to 223 (=255 - 32) } if {[string length $text] < 2} {return $text} set parts [punk::ansi::ta::split_codes $text] + #review - if we have only one element of a paired codeset such as PM,SOS - it will not be found by split_codes + #The output technically then still contains ansi (which may for example be hidden by terminal despite lack of closing ST) if {[llength $parts] == 1} {return [lindex $parts 0]} set out "" #todo - try: join [lsearch -stride 2 -index 0 -subindices -all -inline $parts *] "" @@ -5686,21 +5689,106 @@ tcl::namespace::eval punk::ansi { #[list_end] [comment {--- end definitions namespace punk::ansi::codetype ---}] } tcl::namespace::eval sequence_type { - proc is_Fe {code} { + #first byte after ESC identifies code type + #NOTE - we are looking for valid start of a single sequence here + #- not whether it is complete or where it ends, unless it's a fixed number of bytes + + #\u0020-\u002F + # ESC !"#$%&'()*+,-./ + + #\u0030-\u003F + #ESC 0-9:;<=>? + + #\u0040-\u005F + # ESC @A-Z[\]^ + + #\u0060-\u007E + + proc is_Fe7 {code} { # C1 control codes - if {[regexp {^\033\[[\u0040-\u005F]}]} { - #7bit - typical case - return 1 - } + #7bit - typical case + # ESC @A-Z[\]^ + return [regexp {^\033[\u0040-\u005F]} $code] + } + proc is_Fe {code} { + #although Fe7 more common - we'll put the simpler regex for 8 first + return [expr {[is_Fe8 $code] || [is_Fe7 $code]}] + } + proc is_Fe8 {code} { #8bit - #review - all C1 escapes ? 0x80-0x90F + #review - all C1 escapes ? 0x80-0x9F #This is possibly problematic as it is affected by encoding. #According to https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit #"However, in character encodings used on modern devices such as UTF-8 or CP-1252, those codes are often used for other purposes, so only the 2-byte sequence is typically used." - return 0 + return [regexp {^[\u0080-\u09F]} $code] + } + #ESC 0-9,:,;,<,=,>,? + proc is_Fp {code} { + #single byte following ESC + return [regexp {^\033[\u0030-\u003F]$} $code] } + + #https://en.wikipedia.org/wiki/ISO/IEC_2022 + #e.g + # ESC a (INT) interrupts the current process + # ESC c (RIS) reset terminal to initial state + #ESC `a-z{|}~ proc is_Fs {code} { - puts stderr "is_Fs unimplemented" + #single byte following ESC + return [regexp {^\033[\u0060-\u007E]$} $code] + } + + + proc is_nF {code} { + #2 bytes + #subcategorised by the low two bits of the first byte (n) + #further by whether the final byte is in \u0030-u003f (p) or not (t) + return [regexp {^\033[\u0020-\u002F]+[\u0030-\u007E]$} $code] + } + + #review - test + #3Fp - private use + #e.g vt100 + # ESC#3 DECDHL double-height letters top half + # ESC#4 DECDHL double-height letters bottom half + # ESC#5 DECSWL single-width line + # ESC#6 DECDWL double-width line + proc is_3Fp {code} { + return [regexp {^\033#[\u0020-\u002F]*[\u0030-\u003F]$} $code] ;#check regexp + } + + proc is_code7 {code} { + #Fe | Fs | Fp | nF | Fe + return [regexp {^\033[\u0040-\u005F]|^\033[\u0060-\u007e]$|^\033[\u0030-\u003F]$|^\033[\u0020-\u002F]+[\u0030-\u007E]$} $code] + } + proc is_code8 {code} { + return [regexp {^[\u0080-\u09F]} $code] + } + proc is_code {code} { + return [expr {[is_code8 $code] || [is_code7 $code]}] + } + + proc classify {code} { + return [switch -regexp -- $code { + {^\033[\u0030-\u003F]$} {string cat Fp} + {^[\u0080-\u009F]|^\033[\u0040-\u005F]} {string cat Fe} + {^\033[\u0060-\u007E]$} {string cat Fs} + {^\033[\u0020-\u002F]+[\u0030-\u007E]$} { + #nF sequences + set firstbytenum [scan [string index $code 1] %c] + set lastbyte [string index $code end] + + set n [expr {$firstbytenum & 3}] + if {[regexp {[\u0030-\u003F]} $lastbyte]} { + set tp p + } else { + set tp t + } + string cat ${n}F$tp + } + {^\033#[\u0020-\u002F]*[\u0030-\u003F]$} {string cat 3Fp} + default {string cat unknown} + }] } } # -- --- --- --- --- --- --- --- --- --- --- @@ -5718,6 +5806,7 @@ tcl::namespace::eval punk::ansi::ta { #[para] https://github.com/perlancar/perl-Text-ANSI-Util/blob/master/lib/Text/ANSI/BaseUtil.pm #[list_begin definitions] tcl::namespace::path ::punk::ansi + namespace export detect detect_in_list detect_sgr extract length split_codes split_at_codes split_codes_single variable PUNKARGS @@ -5748,20 +5837,27 @@ tcl::namespace::eval punk::ansi::ta { variable re_osc_open {(?:\x1b\]|\u009d).*} + #review - distinguishing standalone codes vs those that are paired with contents considered part of the code + #e.g PM,SOS are 'paired' ended by ST + #variable standalone_code_map [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> "" \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] variable re_standalones {(?:\x1bc|\x1b7|\x1b8|\x1bM|\x1bD|\x1bE|\x1bH|\x1b=|\x1b>|\x1b#3|\x1b#4|\x1b#5|\x1b#6|\x1b#8)} variable re_standalones_vt52 {(?:\x1bZ)} - #ESC Y move, ESC b foreground colour + # -- + #ESC Y move - \x1bY ie 2 bytes to close + #ESC b foreground colour - \x1bb 1 byte to close + variable re_vt52_incomplete {(?:\x1bY(.){0,1}$|\x1bb$)} + #\x1bc vt52 bgcolour conflict ? #ESC F - gr-on ESC G - gr-off - variable re_vt52_open {(?:\x1bY|\x1bb|\x1bF)} - #\x1bc vt52 bgcolour conflict ?? + # -- #if we don't split on altgraphics too and separate them out - it's easy to get into a horrible mess variable re_g0_group {(?:\x1b\(0)(?:(?!\x1b\(B).)*\x1b\(B} variable re_g0_open {(?:\x1b\(0)} variable re_g0_close {(?:\x1b\(B)} + #detect start of ansicode that is closed by ST # DCS "ESC P" or "0x90" is also terminated by ST set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} #ST terminators [list \007 \033\\ \u009c] @@ -5777,12 +5873,15 @@ tcl::namespace::eval punk::ansi::ta { variable re_ST {(?:\x1bP|\x1bX|\x1b\^|\x1b_)(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007)|(?:\u0090|\u0098|\u009E|\u009F)(?:(?!\u009c).)*(?:\u009c)} + #variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_standalones}|${re_standalones_vt52}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" + #variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" + variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_ST_open}|${re_vt52_incomplete}" #consider standalones as self-opening/self-closing - therefore included in both ansi_detect and ansi_detect_open #default for regexes is non-newline-sensitive matching - ie matches can span lines # -- --- --- --- - variable re_ansi_detect1 "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" + #variable re_ansi_detect1 "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" # -- --- --- --- #handrafted TRIE version of above. Somewhat difficult to construct and maintain. TODO - find a regexp TRIE generator that works with Tcl regexes #This does make things quicker - but it's too early to finalise the detect/split regexes (e.g missing \U0090 ) - will need to be redone. @@ -5802,20 +5901,24 @@ tcl::namespace::eval punk::ansi::ta { # plus more for auxiliary keypad codes in keypad application mode (and some in numeric mode) #regexp expanded syntax = ?x + #full detect - checking for closing sequences variable re_ansi_detect {(?x) - (?:\x1b(?:\[(?:[\x20-\x2f\x30-\x3f]*[\x40-\x7e])|c|7|8|M|D|E|H|=|>|<|A|B|C|I|J|K|L|M|Z|(?:Y(?:..))|(?:b(?:.))|\((?:0|B)|\](?:(?:[^\007]*)\007|(?:(?!\x1b\\).)*\x1b\\)|(?:P|X|\^|_)(?:(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007))|(?:\#(?:3|4|5|6|8)))) + (?:\x1b(?:\[(?:[\x20-\x3f]*[\x40-\x7e])|a|c|7|8|M|D|E|H|=|>|<|A|B|C|I|J|K|L|M|Z|(?:Y(?:..))|(?:b(?:.))|\((?:0|B)|\](?:(?:[^\007]*)\007|(?:(?!\x1b\\).)*\x1b\\)|(?:P|X|\^|_)(?:(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007))|(?:\#(?:3|4|5|6|8)))) |(?:\u0090|\u0098|\u009E|\u009F)(?:(?!\u009c).)*(?:\u009c) - |(?:\u009b)[\x20-\x2f\x30-\x3f]*[\x40-\x7e] + |(?:\u009b)[\x20-\x3f]*[\x40-\x7e] |(?:\u009d)(?:[^\u009c]*)?\u009c } #--- + #todo + #variable re_ansi_detectcode $re_ansi_detect + #variable re_ansi_detectcode {\x1b[\u0040-\u005F]|\x1b[\u0060-\u007e]|\x1b[\u0030-\u003F]|\x1b[\u0020-\u002F]+[\u0030-\u007E]} + variable re_ansi_detectcode {(?:\x1b(?:[\u0030-\u007E]|[\u0020-\u002F]+[\u0030-\u007E]))|[\u0090-\u009F]} # -- --- --- --- #variable re_csi_code {(?:\x1b\[|\u009b)[\x20-\x2f\x30-\x3f]*[\x40-\x7e]} - variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_standalones}|${re_standalones_vt52}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" #may be same as detect - kept in case detect needs to diverge #variable re_ansi_split "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" @@ -5827,20 +5930,6 @@ tcl::namespace::eval punk::ansi::ta { set re_ansi_split_multi "(?:${re_ansi_split})+" } - lappend PUNKARGS [list -dynamic 0 { - @id -id ::punk::ansi::ta::detect - @cmd -name punk::ansi::ta::detect -help\ - "Return a boolean indicating whether Ansi codes were detected in text. - Important caveat: - When text is a tcl list made from splitting (or lappending) some ansi string - - individual elements may be braced or have certain chars escaped. - (one example is if a list element contains an unbalanced brace) - This can cause square brackets that form part of the ansi to be backslash escaped - - and the function can fail to match it as an Ansi code. - " - @values -min 1 - text -type string - } ] #*** !doctools #[call [fun detect] [arg text]] @@ -5851,10 +5940,35 @@ tcl::namespace::eval punk::ansi::ta { #detect any ansi escapes #review - only detect 'complete' codes - or just use the opening escapes for performance? + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::detect + @cmd -name punk::ansi::ta::detect\ + -summary\ + "Test if text has completed ANSI codes"\ + -help\ + "Return a boolean indicating whether *complete* Ansi codes were detected in text. + + By complete, it means that paired squences such as PM (privacy message) must be + closed. It also means that a truncated sequence such as \\x1b\\\[ or a lone escape + will not be detected as ANSI. + + Use punk::ansi::ta::detectcode as a slightly faster detector for ANSI codes, that does + not require paired sequences to have both starting and end sequences to be detected. + + Important caveat: + When text is a tcl list made from splitting (or lappending) some ansi string + - individual elements may be braced or have certain chars escaped. + (one example is if a list element contains an unbalanced brace) + This can cause square brackets that form part of the ansi to be backslash escaped + - and the function can fail to match it as an Ansi code. + " + @values -min 1 + text -type string -help\ + "Block of text. See caveat above about lists." + } ] proc detect {text} [string map [list [list $re_ansi_detect]] { regexp $text }] - #can be used on dicts - but will check keys too. keys could also contain ansi and have escapes proc detect_in_list {list} { #loop is commonly faster than using join. (certain ansi codes triggering list quoting? review) @@ -5865,6 +5979,22 @@ tcl::namespace::eval punk::ansi::ta { } return 0 } + + #will detect for example lone opening or closing PM + proc detectcode {text} [string map [list [list $re_ansi_detectcode]] { + regexp $text + }] + proc detectcode_in_list {list} { + #loop is commonly faster than using join. (certain ansi codes triggering list quoting? review) + foreach item $list { + if {[detectcode $item]} { + return 1 + } + } + return 0 + } + + #surprisingly - the ::join operation can be (relatively) slow on lists containing ansi proc detect_in_list2 {list} { detect [join $list " "] @@ -5915,6 +6045,8 @@ tcl::namespace::eval punk::ansi::ta { variable re_sgr expr {[regexp $re_sgr $text]} } + + #perl: ta_strip proc strip {text} { #*** !doctools #[call [fun strip] [arg text]] @@ -5922,6 +6054,39 @@ tcl::namespace::eval punk::ansi::ta { #[para]This is a tailcall to punk::ansi::ansistrip tailcall ansistrip $text } + + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::extract + @cmd -name punk::ansi::ta::extract\ + -summary\ + "Return only the ANSI codes in text"\ + -help\ + "This is the opposite of strip, + returning only the ANSI codes in text." + @values -min 1 -max 1 + text -type string + } ] + proc extract {text} { + set parts [split_codes $text] + set out "" + foreach {pt code} $parts { + append out $code + } + return $out + } + + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::length + @cmd -name punk::ansi::ta::length\ + -summary\ + "Calculate length of text (excluding the ANSI codes)"\ + -help\ + "Calculate length of text (excluding the ANSI codes) + This is not the printing length of the string on screen." + @values -min 1 + text -type string + } ] + #perl: ta_length proc length {text} { #*** !doctools #[call [fun length] [arg text]] @@ -5936,6 +6101,8 @@ tcl::namespace::eval punk::ansi::ta { # #} + #perl: ta_trunc + #truncate $text to $width columns while still including all the ANSI colour codes. proc trunc {text width args} { } @@ -8504,6 +8671,10 @@ tcl::namespace::eval punk::ansi::internal { } } +tcl::namespace::eval punk::ansi { + namespace import ::punk::ansi::ta::detect +} + #inserting into global namespace like this should be kept to a minimum.. but this is considered a core aspect of punk::ansi #todo - document interp alias {} ansistring {} ::punk::ansi::ansistring diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm index fc438d57..a6224c0d 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/args-0.2.tm @@ -1364,6 +1364,11 @@ tcl::namespace::eval punk::args { #} tcl::dict::set tmp_optspec_defaults -type $v } + -defaultdisplaytype { + #how the -default is displayed + #-default doesn't have to be the same type as -type which validates user input that is not defaulted. + tcl::dict::set tmp_optspec_defaults -defaultdisplaytype $v + } -parsekey { tcl::dict::set tmp_optspec_defaults -parsekey $v @@ -1441,7 +1446,7 @@ tcl::namespace::eval punk::args { default { set known { -parsekey -group -grouphelp\ -any -anyopts -arbitrary -form -minsize -maxsize -choices -choicegroups -choicemultiple -choicecolumns -choicelabels -choiceinfo\ - -type -range -typeranges -default -typedefaults + -type -range -typeranges -default -defaultdisplaytype -typedefaults -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted -nocase\ -nominsize -nomaxsize -norange -nochoices -nochoicelabels\ -type -optional -allow_ansi -validate_ansistripped -strip_ansi -multiple -prefix\ @@ -2052,6 +2057,9 @@ tcl::namespace::eval punk::args { tcl::dict::set spec_merged -optional 1 } } + -defaultdisplaytype { + tcl::dict::set spec_merged -defaultdisplaytype $specval + } -typedefaults { set typecount [llength [tcl::dict::get $spec_merged -type]] if {$typecount != [llength $specval]} { @@ -2114,7 +2122,7 @@ tcl::namespace::eval punk::args { -form -type\ -parsekey -group\ -range -typeranges\ - -default -typedefaults\ + -default -defaultdisplaytype -typedefaults\ -minsize -maxsize -choices -choicegroups\ -choicemultiple -choicecolumns -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted\ -nocase -optional -multiple -validate_ansistripped -allow_ansi -strip_ansi -help\ @@ -3784,7 +3792,32 @@ tcl::namespace::eval punk::args { } if {[dict exists $arginfo -default]} { - set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + #default isn't necessarily of same type as -type required for validation + #Guessing at the type from the data is likely to be unsatisfactory. + + set defaultdisplaytype [Dict_getdef $arginfo -defaultdisplaytype string] + switch -- $defaultdisplaytype { + dict { + #single level + set rawdefault [dict get $arginfo -default] + set default "{\n" + dict for {k v} $rawdefault { + append default " \"$k\" \"$v\"\n" + } + append default "}" + } + list { + set default "{\n" + foreach v $rawdefault { + append default " \"$v\"\n" + } + append default "}" + } + default { + #set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + set default "'[dict get $arginfo -default]'" + } + } } else { set default "" } diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/char-0.1.0.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/char-0.1.0.tm index f8123b94..69df08b9 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/char-0.1.0.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/char-0.1.0.tm @@ -89,6 +89,107 @@ tcl::namespace::eval punk::char { variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? + + #more useful for referring to ANSI documentation would be a proper 7-bit and 8-bit 'Code Table' layout + #as described in ECMA-35 5.2 + # where the positions of the table are in one-to-one correspondence with the bit combinations of the code. + #- for 7-bit: 8 columns 16 rows + #- for 8-bit 16 columns 16 rows + proc codetable {which} { + set bits 8 + switch -- $which { + ascii8 { + set which default + } + ascii { + set bits 7 + } + default { + if {$which ni [encoding names]} { + error "codetable unsupported - use 'ascii' or an entry from the result of the 'encoding names' command." + } + } + } + package require punk::ansi + + set hibit_count [expr {$bits-4}] + set bitcolumns [expr {2**$hibit_count}] ;#always 4 bits for the rows - remaining bits for the columns + set columncount [expr {$bitcolumns + 6}] + + + #set header1 [list "" "b7\nb6\nb5" "0\n0\n0" "0\n0\n1" "0\n1\n0" "0\n1\n1" "1\n0\n0" "1\n0\n1" "1\n1\n0" "1\n1\n1"] + set header1 [list] + set hibits_label "" + set indent "" + for {set hb $bits} {$hb > 4} {incr hb -1} { + append hibits_label ${indent}b$hb\n + append indent " " + } + set hibits_label [string range $hibits_label 0 end-1] + lappend header1 $hibits_label "" "" "" "" "" + + for {set colidx 0} {$colidx < $bitcolumns} {incr colidx} { + set binval [format %0${hibit_count}b $colidx] + set binvalbits [split $binval ""] + set indent "" + set display_hibits "" + foreach bb $binvalbits { + append display_hibits $indent$bb\n + append indent " " + } + set display_hibits [string range $display_hibits 0 end-1] + lappend header1 $display_hibits + } + #\u2193 down arrow + #right-down arrows + #\u2ba7 + #\u21b4 + #\u2b0e + set header2 [list " Bits" b4 b3 b2 b1 "column \u2192\nrow \u2b0e" {*}[punk::lib::range 0 $bitcolumns-1]] + set headers [list $header1 $header2] + + #set t [textblock::table -return tableobject -rows $rows] + set t [textblock::table -return tableobject] + #todo - fix textblock::table to allow configure -columncount + for {set c 0} {$c < $columncount} {incr c} { + $t add_column + } + + set vheaders [punk::transpose_equal_lists $headers] + set hidx -1 + foreach vh $vheaders { + incr hidx + $t configure_column $hidx -headers $vh + } + $t configure_header 0 -colspans [list 6 0 0 0 0 0 {*}[lrepeat $bitcolumns 1]] + $t configure_column 0 -blockalign left + + #always 16 rows - remaining bits form the columns + for {set ridx 0} {$ridx <= 15} {incr ridx} { + set charlist [list] + set lowbits [format %04b $ridx] + for {set i 0} {$i < $bitcolumns} {incr i} { + set hibits [format %0${hibit_count}b $i] + set ch [format %c [scan ${hibits}${lowbits} %b]] + #puts "-->${hibits}${lowbits} ch:$ch" + if {$which ne "default"} { + if {[catch {encoding convertfrom $which $ch} ch]} { + set ch [punk::ansi::a red bold]-[punk::ansi::a] + lappend charlist $ch + } else { + lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch] + } + } else { + lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch] + } + } + set r [list "" {*}[split $lowbits ""] $ridx {*}$charlist] + $t add_row $r + } + puts stderr $t + $t print + } + #just the 7-bit ascii. use [page ascii] for the 8-bit layout proc ascii {} {return { 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/console-0.1.1.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/console-0.1.1.tm index 4322ceaa..ea8d3f77 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/console-0.1.1.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/console-0.1.1.tm @@ -1336,6 +1336,7 @@ namespace eval punk::console { #https://vt100.net/docs/vt510-rm/DA1.html # proc get_device_attributes {{inoutchannels {stdin stdout}}} { + #Note the vt52 rough equivalen \x1bZ - commonly supported but probably best considered obsolete as it collides with ECMA 48 SCI Single Character Introducer #DA1 variable last_da1_result #first element in result is the terminal's architectural class 61,62,63,64.. ? diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm index 6ce76618..46cd5668 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm @@ -2242,6 +2242,121 @@ namespace eval punk::lib { } } + + #review - compare to IMAP4 methods of specifying ranges? + punk::args::define { + @id -id ::punk::lib::indexset_resolve + @cmd -name punk::lib::indexset_resolve\ + -summary\ + "Resolve an indexset to a list of integers based on supplied list or string length."\ + -help\ + "Resolve an 'indexset' to a list of actual indices within the range of the provided numitems value. + An indexset consists of a comma delimited list of indexes or index-ranges. + The indexes are 0-based. + Ranges must be specified with .. as the separator. + Whitespace is ignored. + Each index (or endpoint of an index-range) can be of the forms accepted by Tcl list or string commands, + e.g end-2 or 2+2. + + end means the last page. + end-1 means the second last page. + 0.. is the same as 0..end. + examples: + 1,3.. + output the page index 1 (2nd page) followed by all from index 3 to the end. + 0-2,end + output the first 3 pages, and the last page. + end-1..0 + output the indexes in reverse order from 2nd last page to first page." + @values -min 2 -max 2 + numitems -type integer + indexset -type string + } + proc indexset_resolve {numitems indexset} { + if {![string is integer -strict $numitems] || ![regexp {^[\-\+_end,\.0-9]*$} $indexset]} { + #use parser on unhappy path only + set errmsg [punk::args::usage -scheme error ::punk::lib::indexset_resolve] + uplevel 1 [list return -code error -errorcode {TCL WRONGARGS PUNK} $errmsg] + } + set index_list [list] ;#list of actual indexes within the range + set iparts [split $indexset ,] + set index_list [list] + foreach ipart $iparts { + set ipart [string trim $ipart] + set rposn [string first .. $ipart] + if {$rposn>=0} { + #range + lassign [punk::lib::string_splitbefore_indices $ipart $rposn $rposn+2] rawa _ rawb + set rawa [string trim $rawa] + set rawb [string trim $rawb] + if {$rawa eq ""} {set rawa 0} + set a [punk::lib::lindex_resolve $numitems $rawa] + if {$a == -3} { + #undershot - leave negative + } elseif {$a == -2 && $rawa ne "-2"} { + #overshot + set a [expr {$numitems}] ;#put it outside the range on the upper side + } + + if {$rawb eq ""} { + if {$a > $numitems-1} { + set rawb $a ;#make sure .. doesn't return last item - should return nothing + } else { + set rawb end + } + } + set b [punk::lib::lindex_resolve $numitems $rawb] + if {$b == -3} { + #undershot - leave negative + } elseif {$b == -2 && $rawb ne "-2"} { + set b [expr {$numitems}] ;#overshot - put it outside the range on the upper side + } + + #e.g make sure .. doesn't return last item - should return nothing as both are above the range. + if {$a >= 0 && $a <= $numitems-1 && $b >=0 && $b <= $numitems-1} { + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } else { + if {$a >= 0 && $a <= $numitems-1} { + #only a is in the range + if {$b < 0} { + set b 0 + } else { + set b [expr {$numitems-1}] + } + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } elseif {$b >=0 && $b <= $numitems-1} { + #only b is in the range + if {$a < 0} { + set a 0 + } else { + set a [expr {$numitems-1}] + } + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } else { + #both outside the range + if {$a < 0 && $b > 0} { + #spans the range in forward order + set a 0 + set b [expr {$numitems-1}] + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } elseif {$a > 0 && $b < 0} { + #spans the range in reverse order + set a [expr {$numitems-1}] + set b 0 + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } + #both outside of range on same side + } + } + } else { + set idx [punk::lib::lindex_resolve_basic $numitems $ipart] + if {$idx >= 0} { + lappend index_list $idx + } + } + } + return $index_list + } # showdict uses lindex_resolve results -2 & -3 to determine whether index is out of bounds on upper vs lower side #REVIEW: This shouldn't really need the list itself - just the length would suffice punk::args::define { @@ -2305,7 +2420,8 @@ namespace eval punk::lib { #<0 ? error "lindex_resolve len must be an integer" } - set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + set index [tcl::string::map {_ {}} $index] ;#basic forward compatibility with integers such as 1_000 for 8.6 + #todo - be stricter about malformations such as 1000_ if {[string is integer -strict $index]} { #can match +i -i if {$index < 0} { @@ -3345,8 +3461,12 @@ namespace eval punk::lib { #NOTE: running ta::detect on a list (or dict) as a whole can be problematic if items in the list have backslash escapes due to Tcl list quoting and escaping behaviour. #This commonly happens if there is an unbalanced brace (which is a normal occurrence and needs to be handled) #ta::detect on a list of ansi-containing string may appear to work for some simple inputs but is not reliable - #detect_in_list will check at first level. (not intended for detecting ansi in deeper structures) - if {![punk::ansi::ta::detect_in_list $linelist]} { + #detect_in_list/detectcode_in_list will check at first level. (not intended for detecting ansi in deeper structures) + + #we use detectcode_in_list instead of detect_in_list + #detectcode_in_list will detect unclosed (or unopened) paired sequences such as PM (privacy message) + # - but the main reason is it is slightly faster. + if {![punk::ansi::ta::detectcode_in_list $linelist]} { if {$opt_ansiresets} { foreach ln $linelist { lappend transformed $RST$ln$RST diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm index f8e55b02..10fda84e 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm @@ -13,6 +13,9 @@ # @@ Meta End +#BUGS +# 2025-08 +# n// and n/// won't output info about 'namespace path' if there are no commands in the namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements @@ -1235,7 +1238,7 @@ tcl::namespace::eval punk::ns { if {[dict size [dict get $nsdict namespacepath]]} { set path_text "" if {!$opt_nspathcommands} { - append path_text \n " Also resolving cmds in namespace paths: [dict keys [dict get $nsdict namespacepath]]" + append path_text \n " Also resolving cmds in namespace paths: [dict keys [dict get $nsdict namespacepath]] (use n/// to display)" } else { append path_text \n " Also resolving cmds in namespace paths:" set nspathdict [dict get $nsdict namespacepath] @@ -1247,8 +1250,17 @@ tcl::namespace::eval punk::ns { } } else { #todo - change to display in column order to be same as main command listing + set parentcommands [dict get $nsdict commands] dict for {k v} $nspathdict { - set pathcommands [dict get $v commands] + set rawpathcommands [dict get $v commands] + set pathcommands [list] + foreach c $rawpathcommands { + if {$c in $parentcommands} { + lappend pathcommands [punk::ansi::a strike]$c[a] + } else { + lappend pathcommands $c + } + } set columns 6 if {[llength $pathcommands] < 6} { set columns [llength $v] diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/shellfilter-0.2.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/shellfilter-0.2.tm index e04e5107..8017d3f5 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/shellfilter-0.2.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/shellfilter-0.2.tm @@ -579,13 +579,15 @@ namespace eval shellfilter::chan { #punk::ansi::ansistrip converts at least some of the box drawing G0 chars to unicode - todo - more complete conversion #assumes line-buffering. a more advanced filter required if ansicodes can arrive split across separate read or write operations! oo::class create ansistrip { - variable o_trecord + variable o_trecord variable o_enc + variable o_encbuf variable o_is_junction constructor {tf} { package require punk::ansi set o_trecord $tf set o_enc [dict get $tf -encoding] + set o_encbuf "" if {[dict exists $tf -junction]} { set o_is_junction [dict get $tf -junction] } else { @@ -614,10 +616,36 @@ namespace eval shellfilter::chan { method flush {transform_handle} { return "" } + #method write {transform_handle bytes} { + # #broken due to occasional unexpected byte sequence + # set instring [encoding convertfrom $o_enc $bytes] + # set outstring [punk::ansi::ansistrip $instring] + # return [encoding convertto $o_enc $outstring] + #} method write {transform_handle bytes} { - set instring [encoding convertfrom $o_enc $bytes] - set outstring [punk::ansi::ansistrip $instring] - return [encoding convertto $o_enc $outstring] + #set instring [tcl::encoding::convertfrom $o_enc $bytes] ;naive approach will break due to unexpected byte sequence - occasionally + #bytes can break at arbitrary points making encoding conversions invalid. + + set inputbytes $o_encbuf$bytes + set o_encbuf "" + set tail_offset 0 + while {$tail_offset < [string length $inputbytes] && [catch {tcl::encoding::convertfrom $o_enc [string range $inputbytes 0 end-$tail_offset]} stringdata]} { + incr tail_offset + } + if {$tail_offset > 0} { + if {$tail_offset < [string length $inputbytes]} { + #stringdata from catch statement must be a valid result + set t [expr {$tail_offset - 1}] + set o_encbuf [string range $inputbytes end-$t end] + } else { + set stringdata "" + set o_encbuf $inputbytes + return "" + } + } + + set outstring [punk::ansi::ansistrip $stringdata] + return [tcl::encoding::convertto $o_enc $outstring] } method meta_is_redirection {} { return $o_is_junction @@ -724,6 +752,8 @@ namespace eval shellfilter::chan { set emit "" if {[string last \x1b $buf] >= 0} { #detect will detect ansi SGR and gron groff and other codes + #REVIEW - ta::detect won't detect SOS without paired ST for things like PM + # ta::detectcode will - but then split_codes_single will treat unpaired SOS as text? if {[punk::ansi::ta::detect $buf]} { #split_codes_single regex faster than split_codes - but more resulting parts #'single' refers to number of escapes - but can still contain e.g multiple SGR codes (or mode set operations etc) @@ -1035,7 +1065,7 @@ namespace eval shellfilter::chan { } #todo - something oo::class create rebuffer { - variable o_trecord + variable o_trecord variable o_enc constructor {tf} { set o_trecord $tf diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm index de0164cd..97969463 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/bootsupport/modules/textblock-0.1.3.tm @@ -2528,7 +2528,7 @@ tcl::namespace::eval textblock { set ansiborder_final $ansiborder_body_col_row #$c will always have ansi resets due to overtype behaviour ? #todo - review overtype - if {[punk::ansi::ta::detect $c]} { + if {[punk::ansi::ta::detectcode $c]} { #use only the last ansi sequence in the cell value #Filter out foreground and use background for ansiborder override set parts [punk::ansi::ta::split_codes_single $c] @@ -4377,6 +4377,7 @@ tcl::namespace::eval textblock { if {![punk::ansi::ta::detect $block]} { return $block } + #could be newine in SOS/PM? review - terminal should theoretically ignore all chars til close of string foreach ln [split $block \n] { set parts [punk::ansi::ta::split_codes $ln] if {[lindex $parts 0] eq ""} { @@ -4894,7 +4895,9 @@ tcl::namespace::eval textblock { } set textblock [textutil::tabify::untabify2 $textblock $tw] } - if {[string length $textblock] > 1 && [punk::ansi::ta::detect $textblock]} { + #review detectcode vs detect. detectcode won't look for completness of all codes to give a positive result + #ansistripraw splits on complete codes.. + if {[string length $textblock] > 1 && [punk::ansi::ta::detectcode $textblock]} { #ansistripraw slightly faster than ansistrip - and won't affect width (avoid detect_g0/conversions) set textblock [punk::ansi::ansistripraw $textblock] } @@ -4917,7 +4920,9 @@ tcl::namespace::eval textblock { } else { set tl $textblock } - if {[punk::ansi::ta::detect $tl]} { + #review detectcode vs detect. detectcode won't look for completness of all codes to give a positive result + #ansistripraw splits on complete codes.. + if {[punk::ansi::ta::detectcode $tl]} { set tl [punk::ansi::ansistripraw $tl] } return [punk::char::ansifreestring_width $tl] @@ -4964,7 +4969,7 @@ tcl::namespace::eval textblock { set textblock [textutil::tabify::untabify2 $textblock $tw] } #ansistripraw on entire block in one go rather than line by line - result should be the same - review - make tests - if {[string length $textblock] > 1 && [punk::ansi::ta::detect $textblock]} { + if {[string length $textblock] > 1 && [punk::ansi::ta::detectcode $textblock]} { set textblock [punk::ansi::ansistripraw $textblock] } if {[tcl::string::last \n $textblock] >= 0} { @@ -5211,7 +5216,7 @@ tcl::namespace::eval textblock { set known_hasansi [tcl::dict::get $opts -known_hasansi] if {$known_hasansi eq ""} { - set block_has_ansi [punk::ansi::ta::detect $block] + set block_has_ansi [punk::ansi::ta::detectcode $block] } else { set block_has_ansi $known_hasansi } @@ -5648,7 +5653,7 @@ tcl::namespace::eval textblock { set rowcount 0 set blocklists [list] foreach b $blocks { - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { #-ansireplays 1 quite expensive e.g 7ms in 2024 set bl [punk::lib::lines_as_list -ansireplays 1 -ansiresets $ansiresets -- $b] } else { @@ -5676,7 +5681,7 @@ tcl::namespace::eval textblock { set i -1 foreach b $args { incr i - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { #-ansireplays 1 quite expensive e.g 7ms in 2024 set blines [punk::lib::lines_as_list -ansireplays 1 -ansiresets auto -- $b] } else { @@ -5799,7 +5804,7 @@ tcl::namespace::eval textblock { #testing of any decent size block line by line - even with max_string_length_line is slower than ta::detect anyway. #blocks passed to join can be ragged - so we can't pass -known_samewidth to pad - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { # - we need to join to use pad - even though we then need to immediately resplit REVIEW (make line list version of pad?) set replay_block [::join [punk::lib::lines_as_list -ansireplays 1 -ansiresets $ansiresets -- $b] \n] #set blines [split [textblock::pad $replay_block -known_hasansi 1 -known_samewidth $is_samewidth -known_blockwidth $bwidth -width $bwidth -which right -padchar " "] \n] @@ -8650,7 +8655,7 @@ tcl::namespace::eval textblock { set cache_inner [overtype::block -blockalign $opt_blockalign -ellipsis $opt_ellipsis $underlay $cache_contentpattern] #puts "frame--->ansiwrap -rawansi [ansistring VIEW $opt_ansibase] $cache_inner" if {$opt_ansibase ne ""} { - if {[punk::ansi::ta::detect $cache_inner]} { + if {[punk::ansi::ta::detectcode $cache_inner]} { #set cache_inner [punk::ansi::ansiwrap -rawansi $opt_ansibase $cache_inner] #set cache_inner [punk::ansi::ansiwrap_raw $opt_ansibase "" "" $cache_inner] #jjj ??? review @@ -8745,7 +8750,7 @@ tcl::namespace::eval textblock { #set cwidth [textblock::width $contents] #JJJ - set contents_has_ansi [punk::ansi::ta::detect $contents] + set contents_has_ansi [punk::ansi::ta::detectcode $contents] if {$opt_ansibase ne ""} { if {$contents_has_ansi} { #set contents [punk::ansi::ansiwrap -rawansi $opt_ansibase $contents] @@ -8799,8 +8804,9 @@ tcl::namespace::eval textblock { if {$subposn >= 0} { set content_line [lindex $clines $contentindex] #review - different forms of reset e.g \x1b\[m ?? - if {[string range $content_line 0 3] eq "\x1b\[0m"} { + if {[tcl::string::range $content_line 0 3] eq "\x1b\[0m"} { set content_line [tcl::string::range $content_line 4 end] + #::tcl::string::replace content_line 0 3 } append content_line $opt_ansibase append fs [tcl::string::replace $tline $subposn $subposn+$pattern_offset $content_line] \n diff --git a/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl b/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl index 8cb8153c..c3cbc9aa 100644 --- a/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl +++ b/src/project_layouts/custom/_project/punk.project-0.1/src/make.tcl @@ -3075,7 +3075,7 @@ foreach vfstail $vfs_tails { #copy the version that is mounted in this runtime to vfsname.new if {[catch { file copy -force $building_runtime $buildfolder/$vfsname.new - catch {exec chmod +x $buildfolder/$vfsname.new} + catch {exec chmod +w $buildfolder/$vfsname.new} } errM]} { puts stderr "$kit_type 'file copy -force $building_runtime $buildfolder/$vfsname.new' failed\n$errM" error $errM diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/flagfilter-0.3.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/flagfilter-0.3.tm index 1d37e215..00f58e82 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/flagfilter-0.3.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/flagfilter-0.3.tm @@ -1538,6 +1538,7 @@ namespace eval flagfilter { } } + #todo - rename 'cprocessor' is misleading oo::class create cprocessor { variable o_runid variable o_name @@ -1577,7 +1578,9 @@ namespace eval flagfilter { if {[dict exists $o_pinfo match]} { set o_matchspec [dict get $o_pinfo match] } else { - set o_matchspec {^[^-^/].*} ;#match anything that isn't flaglike + #review - unix paths? conflict with windows style flag such as /w + #must accept empty string + set o_matchspec {^[^-^/].*|^$} ;#match anything that isn't flaglike } set o_found_match 0 set o_matched_argument "" ;#need o_found_match to differentiate match of empty string diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk-0.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk-0.1.tm index 6fb185a9..83dad2bf 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk-0.1.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk-0.1.tm @@ -6947,7 +6947,8 @@ namespace eval punk { set newrow {} foreach oldrow $list_rows { if {$j >= [llength $oldrow]} { - continue + #continue + lappend newrow "" } else { lappend newrow [lindex $oldrow $j] } @@ -6956,6 +6957,19 @@ namespace eval punk { } return $res } + proc transpose_equal_lists {list_rows} { + set columns [list] + set rowidx -1 + foreach l $list_rows { + set colidx -1 + incr rowidx + foreach val $l { + incr colidx + lset columns $colidx $rowidx $val + } + } + return $columns + } proc transpose_strings {list_of_strings} { set charlists [lmap v $list_of_strings {split $v ""}] set tchars [transpose_lists $charlists] diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm index 255715ad..ad2d58f4 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ansi-0.1.1.tm @@ -3785,7 +3785,7 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu } set codestack [list] - if {[punk::ansi::ta::detect $text]} { + if {[punk::ansi::ta::detectcode $text]} { set emit "" #set parts [punk::ansi::ta::split_codes $text] set parts [punk::ansi::ta::split_codes_single $text] @@ -3892,7 +3892,7 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu } set codestack [list] - if {[punk::ansi::ta::detect $text]} { + if {[punk::ansi::ta::detectcode $text]} { set emit "" #set parts [punk::ansi::ta::split_codes $text] set parts [punk::ansi::ta::split_codes_single $text] @@ -4158,7 +4158,8 @@ Brightblack 100 Brightred 101 Brightgreen 102 Brightyellow 103 Brightblu # where line and column are ascii codes whose values are +31 # vt52 can be entered/exited via escapes # This means we probably need to to wrap enter/exit vt52 and keep this state - as we don't have a standard way to query for terminal type - # (vt52 supports ESC Z - but vt100 sometimes? doesn't - and querying at each output would be slow anyway, even if there was a common query :/ ) + # (vt52 supports ESC Z (obs DECID) - but vt100 sometimes? doesn't - and querying at each output would be slow anyway, even if there was a common query :/ ) + #ESC\[c - is more modern equiv of DECID lappend PUNKARGS [list { @id -id ::punk::ansi::vt52move @@ -4946,6 +4947,8 @@ to 223 (=255 - 32) } if {[string length $text] < 2} {return $text} set parts [punk::ansi::ta::split_codes $text] + #review - if we have only one element of a paired codeset such as PM,SOS - it will not be found by split_codes + #The output technically then still contains ansi (which may for example be hidden by terminal despite lack of closing ST) if {[llength $parts] == 1} {return [lindex $parts 0]} set out "" #todo - try: join [lsearch -stride 2 -index 0 -subindices -all -inline $parts *] "" @@ -5686,21 +5689,106 @@ tcl::namespace::eval punk::ansi { #[list_end] [comment {--- end definitions namespace punk::ansi::codetype ---}] } tcl::namespace::eval sequence_type { - proc is_Fe {code} { + #first byte after ESC identifies code type + #NOTE - we are looking for valid start of a single sequence here + #- not whether it is complete or where it ends, unless it's a fixed number of bytes + + #\u0020-\u002F + # ESC !"#$%&'()*+,-./ + + #\u0030-\u003F + #ESC 0-9:;<=>? + + #\u0040-\u005F + # ESC @A-Z[\]^ + + #\u0060-\u007E + + proc is_Fe7 {code} { # C1 control codes - if {[regexp {^\033\[[\u0040-\u005F]}]} { - #7bit - typical case - return 1 - } + #7bit - typical case + # ESC @A-Z[\]^ + return [regexp {^\033[\u0040-\u005F]} $code] + } + proc is_Fe {code} { + #although Fe7 more common - we'll put the simpler regex for 8 first + return [expr {[is_Fe8 $code] || [is_Fe7 $code]}] + } + proc is_Fe8 {code} { #8bit - #review - all C1 escapes ? 0x80-0x90F + #review - all C1 escapes ? 0x80-0x9F #This is possibly problematic as it is affected by encoding. #According to https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit #"However, in character encodings used on modern devices such as UTF-8 or CP-1252, those codes are often used for other purposes, so only the 2-byte sequence is typically used." - return 0 + return [regexp {^[\u0080-\u09F]} $code] + } + #ESC 0-9,:,;,<,=,>,? + proc is_Fp {code} { + #single byte following ESC + return [regexp {^\033[\u0030-\u003F]$} $code] } + + #https://en.wikipedia.org/wiki/ISO/IEC_2022 + #e.g + # ESC a (INT) interrupts the current process + # ESC c (RIS) reset terminal to initial state + #ESC `a-z{|}~ proc is_Fs {code} { - puts stderr "is_Fs unimplemented" + #single byte following ESC + return [regexp {^\033[\u0060-\u007E]$} $code] + } + + + proc is_nF {code} { + #2 bytes + #subcategorised by the low two bits of the first byte (n) + #further by whether the final byte is in \u0030-u003f (p) or not (t) + return [regexp {^\033[\u0020-\u002F]+[\u0030-\u007E]$} $code] + } + + #review - test + #3Fp - private use + #e.g vt100 + # ESC#3 DECDHL double-height letters top half + # ESC#4 DECDHL double-height letters bottom half + # ESC#5 DECSWL single-width line + # ESC#6 DECDWL double-width line + proc is_3Fp {code} { + return [regexp {^\033#[\u0020-\u002F]*[\u0030-\u003F]$} $code] ;#check regexp + } + + proc is_code7 {code} { + #Fe | Fs | Fp | nF | Fe + return [regexp {^\033[\u0040-\u005F]|^\033[\u0060-\u007e]$|^\033[\u0030-\u003F]$|^\033[\u0020-\u002F]+[\u0030-\u007E]$} $code] + } + proc is_code8 {code} { + return [regexp {^[\u0080-\u09F]} $code] + } + proc is_code {code} { + return [expr {[is_code8 $code] || [is_code7 $code]}] + } + + proc classify {code} { + return [switch -regexp -- $code { + {^\033[\u0030-\u003F]$} {string cat Fp} + {^[\u0080-\u009F]|^\033[\u0040-\u005F]} {string cat Fe} + {^\033[\u0060-\u007E]$} {string cat Fs} + {^\033[\u0020-\u002F]+[\u0030-\u007E]$} { + #nF sequences + set firstbytenum [scan [string index $code 1] %c] + set lastbyte [string index $code end] + + set n [expr {$firstbytenum & 3}] + if {[regexp {[\u0030-\u003F]} $lastbyte]} { + set tp p + } else { + set tp t + } + string cat ${n}F$tp + } + {^\033#[\u0020-\u002F]*[\u0030-\u003F]$} {string cat 3Fp} + default {string cat unknown} + }] } } # -- --- --- --- --- --- --- --- --- --- --- @@ -5718,6 +5806,7 @@ tcl::namespace::eval punk::ansi::ta { #[para] https://github.com/perlancar/perl-Text-ANSI-Util/blob/master/lib/Text/ANSI/BaseUtil.pm #[list_begin definitions] tcl::namespace::path ::punk::ansi + namespace export detect detect_in_list detect_sgr extract length split_codes split_at_codes split_codes_single variable PUNKARGS @@ -5748,20 +5837,27 @@ tcl::namespace::eval punk::ansi::ta { variable re_osc_open {(?:\x1b\]|\u009d).*} + #review - distinguishing standalone codes vs those that are paired with contents considered part of the code + #e.g PM,SOS are 'paired' ended by ST + #variable standalone_code_map [list \x1bc "" \x1b7 "" \x1b8 "" \x1bM "" \x1bE "" \x1bD "" \x1bH "" \x1b= "" \x1b> "" \x1b#3 "" \x1b#4 "" \x1b#5 "" \x1b#6 "" \x1b#8 ""] variable re_standalones {(?:\x1bc|\x1b7|\x1b8|\x1bM|\x1bD|\x1bE|\x1bH|\x1b=|\x1b>|\x1b#3|\x1b#4|\x1b#5|\x1b#6|\x1b#8)} variable re_standalones_vt52 {(?:\x1bZ)} - #ESC Y move, ESC b foreground colour + # -- + #ESC Y move - \x1bY ie 2 bytes to close + #ESC b foreground colour - \x1bb 1 byte to close + variable re_vt52_incomplete {(?:\x1bY(.){0,1}$|\x1bb$)} + #\x1bc vt52 bgcolour conflict ? #ESC F - gr-on ESC G - gr-off - variable re_vt52_open {(?:\x1bY|\x1bb|\x1bF)} - #\x1bc vt52 bgcolour conflict ?? + # -- #if we don't split on altgraphics too and separate them out - it's easy to get into a horrible mess variable re_g0_group {(?:\x1b\(0)(?:(?!\x1b\(B).)*\x1b\(B} variable re_g0_open {(?:\x1b\(0)} variable re_g0_close {(?:\x1b\(B)} + #detect start of ansicode that is closed by ST # DCS "ESC P" or "0x90" is also terminated by ST set re_ST_open {(?:\033P|\u0090|\033X|\u0098|\033\^|\u009e|\033_|\u009f)} #ST terminators [list \007 \033\\ \u009c] @@ -5777,12 +5873,15 @@ tcl::namespace::eval punk::ansi::ta { variable re_ST {(?:\x1bP|\x1bX|\x1b\^|\x1b_)(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007)|(?:\u0090|\u0098|\u009E|\u009F)(?:(?!\u009c).)*(?:\u009c)} + #variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_standalones}|${re_standalones_vt52}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" + #variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" + variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_ST_open}|${re_vt52_incomplete}" #consider standalones as self-opening/self-closing - therefore included in both ansi_detect and ansi_detect_open #default for regexes is non-newline-sensitive matching - ie matches can span lines # -- --- --- --- - variable re_ansi_detect1 "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" + #variable re_ansi_detect1 "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" # -- --- --- --- #handrafted TRIE version of above. Somewhat difficult to construct and maintain. TODO - find a regexp TRIE generator that works with Tcl regexes #This does make things quicker - but it's too early to finalise the detect/split regexes (e.g missing \U0090 ) - will need to be redone. @@ -5802,20 +5901,24 @@ tcl::namespace::eval punk::ansi::ta { # plus more for auxiliary keypad codes in keypad application mode (and some in numeric mode) #regexp expanded syntax = ?x + #full detect - checking for closing sequences variable re_ansi_detect {(?x) - (?:\x1b(?:\[(?:[\x20-\x2f\x30-\x3f]*[\x40-\x7e])|c|7|8|M|D|E|H|=|>|<|A|B|C|I|J|K|L|M|Z|(?:Y(?:..))|(?:b(?:.))|\((?:0|B)|\](?:(?:[^\007]*)\007|(?:(?!\x1b\\).)*\x1b\\)|(?:P|X|\^|_)(?:(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007))|(?:\#(?:3|4|5|6|8)))) + (?:\x1b(?:\[(?:[\x20-\x3f]*[\x40-\x7e])|a|c|7|8|M|D|E|H|=|>|<|A|B|C|I|J|K|L|M|Z|(?:Y(?:..))|(?:b(?:.))|\((?:0|B)|\](?:(?:[^\007]*)\007|(?:(?!\x1b\\).)*\x1b\\)|(?:P|X|\^|_)(?:(?:(?!\x1b\\|\007).)*(?:\x1b\\|\007))|(?:\#(?:3|4|5|6|8)))) |(?:\u0090|\u0098|\u009E|\u009F)(?:(?!\u009c).)*(?:\u009c) - |(?:\u009b)[\x20-\x2f\x30-\x3f]*[\x40-\x7e] + |(?:\u009b)[\x20-\x3f]*[\x40-\x7e] |(?:\u009d)(?:[^\u009c]*)?\u009c } #--- + #todo + #variable re_ansi_detectcode $re_ansi_detect + #variable re_ansi_detectcode {\x1b[\u0040-\u005F]|\x1b[\u0060-\u007e]|\x1b[\u0030-\u003F]|\x1b[\u0020-\u002F]+[\u0030-\u007E]} + variable re_ansi_detectcode {(?:\x1b(?:[\u0030-\u007E]|[\u0020-\u002F]+[\u0030-\u007E]))|[\u0090-\u009F]} # -- --- --- --- #variable re_csi_code {(?:\x1b\[|\u009b)[\x20-\x2f\x30-\x3f]*[\x40-\x7e]} - variable re_ansi_detect_open "${re_csi_open}|${re_osc_open}|${re_standalones}|${re_standalones_vt52}|${re_ST_open}|${re_g0_open}|${re_vt52_open}" #may be same as detect - kept in case detect needs to diverge #variable re_ansi_split "${re_csi_code}|${re_esc_osc1}|${re_esc_osc2}|${re_esc_osc3}|${re_standalones}|${re_ST}|${re_g0_open}|${re_g0_close}" @@ -5827,20 +5930,6 @@ tcl::namespace::eval punk::ansi::ta { set re_ansi_split_multi "(?:${re_ansi_split})+" } - lappend PUNKARGS [list -dynamic 0 { - @id -id ::punk::ansi::ta::detect - @cmd -name punk::ansi::ta::detect -help\ - "Return a boolean indicating whether Ansi codes were detected in text. - Important caveat: - When text is a tcl list made from splitting (or lappending) some ansi string - - individual elements may be braced or have certain chars escaped. - (one example is if a list element contains an unbalanced brace) - This can cause square brackets that form part of the ansi to be backslash escaped - - and the function can fail to match it as an Ansi code. - " - @values -min 1 - text -type string - } ] #*** !doctools #[call [fun detect] [arg text]] @@ -5851,10 +5940,35 @@ tcl::namespace::eval punk::ansi::ta { #detect any ansi escapes #review - only detect 'complete' codes - or just use the opening escapes for performance? + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::detect + @cmd -name punk::ansi::ta::detect\ + -summary\ + "Test if text has completed ANSI codes"\ + -help\ + "Return a boolean indicating whether *complete* Ansi codes were detected in text. + + By complete, it means that paired squences such as PM (privacy message) must be + closed. It also means that a truncated sequence such as \\x1b\\\[ or a lone escape + will not be detected as ANSI. + + Use punk::ansi::ta::detectcode as a slightly faster detector for ANSI codes, that does + not require paired sequences to have both starting and end sequences to be detected. + + Important caveat: + When text is a tcl list made from splitting (or lappending) some ansi string + - individual elements may be braced or have certain chars escaped. + (one example is if a list element contains an unbalanced brace) + This can cause square brackets that form part of the ansi to be backslash escaped + - and the function can fail to match it as an Ansi code. + " + @values -min 1 + text -type string -help\ + "Block of text. See caveat above about lists." + } ] proc detect {text} [string map [list [list $re_ansi_detect]] { regexp $text }] - #can be used on dicts - but will check keys too. keys could also contain ansi and have escapes proc detect_in_list {list} { #loop is commonly faster than using join. (certain ansi codes triggering list quoting? review) @@ -5865,6 +5979,22 @@ tcl::namespace::eval punk::ansi::ta { } return 0 } + + #will detect for example lone opening or closing PM + proc detectcode {text} [string map [list [list $re_ansi_detectcode]] { + regexp $text + }] + proc detectcode_in_list {list} { + #loop is commonly faster than using join. (certain ansi codes triggering list quoting? review) + foreach item $list { + if {[detectcode $item]} { + return 1 + } + } + return 0 + } + + #surprisingly - the ::join operation can be (relatively) slow on lists containing ansi proc detect_in_list2 {list} { detect [join $list " "] @@ -5915,6 +6045,8 @@ tcl::namespace::eval punk::ansi::ta { variable re_sgr expr {[regexp $re_sgr $text]} } + + #perl: ta_strip proc strip {text} { #*** !doctools #[call [fun strip] [arg text]] @@ -5922,6 +6054,39 @@ tcl::namespace::eval punk::ansi::ta { #[para]This is a tailcall to punk::ansi::ansistrip tailcall ansistrip $text } + + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::extract + @cmd -name punk::ansi::ta::extract\ + -summary\ + "Return only the ANSI codes in text"\ + -help\ + "This is the opposite of strip, + returning only the ANSI codes in text." + @values -min 1 -max 1 + text -type string + } ] + proc extract {text} { + set parts [split_codes $text] + set out "" + foreach {pt code} $parts { + append out $code + } + return $out + } + + lappend PUNKARGS [list { + @id -id ::punk::ansi::ta::length + @cmd -name punk::ansi::ta::length\ + -summary\ + "Calculate length of text (excluding the ANSI codes)"\ + -help\ + "Calculate length of text (excluding the ANSI codes) + This is not the printing length of the string on screen." + @values -min 1 + text -type string + } ] + #perl: ta_length proc length {text} { #*** !doctools #[call [fun length] [arg text]] @@ -5936,6 +6101,8 @@ tcl::namespace::eval punk::ansi::ta { # #} + #perl: ta_trunc + #truncate $text to $width columns while still including all the ANSI colour codes. proc trunc {text width args} { } @@ -8504,6 +8671,10 @@ tcl::namespace::eval punk::ansi::internal { } } +tcl::namespace::eval punk::ansi { + namespace import ::punk::ansi::ta::detect +} + #inserting into global namespace like this should be kept to a minimum.. but this is considered a core aspect of punk::ansi #todo - document interp alias {} ansistring {} ::punk::ansi::ansistring diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm index fc438d57..a6224c0d 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/args-0.2.tm @@ -1364,6 +1364,11 @@ tcl::namespace::eval punk::args { #} tcl::dict::set tmp_optspec_defaults -type $v } + -defaultdisplaytype { + #how the -default is displayed + #-default doesn't have to be the same type as -type which validates user input that is not defaulted. + tcl::dict::set tmp_optspec_defaults -defaultdisplaytype $v + } -parsekey { tcl::dict::set tmp_optspec_defaults -parsekey $v @@ -1441,7 +1446,7 @@ tcl::namespace::eval punk::args { default { set known { -parsekey -group -grouphelp\ -any -anyopts -arbitrary -form -minsize -maxsize -choices -choicegroups -choicemultiple -choicecolumns -choicelabels -choiceinfo\ - -type -range -typeranges -default -typedefaults + -type -range -typeranges -default -defaultdisplaytype -typedefaults -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted -nocase\ -nominsize -nomaxsize -norange -nochoices -nochoicelabels\ -type -optional -allow_ansi -validate_ansistripped -strip_ansi -multiple -prefix\ @@ -2052,6 +2057,9 @@ tcl::namespace::eval punk::args { tcl::dict::set spec_merged -optional 1 } } + -defaultdisplaytype { + tcl::dict::set spec_merged -defaultdisplaytype $specval + } -typedefaults { set typecount [llength [tcl::dict::get $spec_merged -type]] if {$typecount != [llength $specval]} { @@ -2114,7 +2122,7 @@ tcl::namespace::eval punk::args { -form -type\ -parsekey -group\ -range -typeranges\ - -default -typedefaults\ + -default -defaultdisplaytype -typedefaults\ -minsize -maxsize -choices -choicegroups\ -choicemultiple -choicecolumns -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted\ -nocase -optional -multiple -validate_ansistripped -allow_ansi -strip_ansi -help\ @@ -3784,7 +3792,32 @@ tcl::namespace::eval punk::args { } if {[dict exists $arginfo -default]} { - set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + #default isn't necessarily of same type as -type required for validation + #Guessing at the type from the data is likely to be unsatisfactory. + + set defaultdisplaytype [Dict_getdef $arginfo -defaultdisplaytype string] + switch -- $defaultdisplaytype { + dict { + #single level + set rawdefault [dict get $arginfo -default] + set default "{\n" + dict for {k v} $rawdefault { + append default " \"$k\" \"$v\"\n" + } + append default "}" + } + list { + set default "{\n" + foreach v $rawdefault { + append default " \"$v\"\n" + } + append default "}" + } + default { + #set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + set default "'[dict get $arginfo -default]'" + } + } } else { set default "" } diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/char-0.1.0.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/char-0.1.0.tm index f8123b94..69df08b9 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/char-0.1.0.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/char-0.1.0.tm @@ -89,6 +89,107 @@ tcl::namespace::eval punk::char { variable invalid "???" ;# ideally this would be 0xFFFD - which should display as black diamond/rhombus with question mark. As at 2023 - this tends to display indistinguishably from other missing glyph boxes - so we use a longer sequence we can detect by length and to display obviously variable invalid_display_char \u25ab; #todo - change to 0xFFFD once diamond glyph more common? + + #more useful for referring to ANSI documentation would be a proper 7-bit and 8-bit 'Code Table' layout + #as described in ECMA-35 5.2 + # where the positions of the table are in one-to-one correspondence with the bit combinations of the code. + #- for 7-bit: 8 columns 16 rows + #- for 8-bit 16 columns 16 rows + proc codetable {which} { + set bits 8 + switch -- $which { + ascii8 { + set which default + } + ascii { + set bits 7 + } + default { + if {$which ni [encoding names]} { + error "codetable unsupported - use 'ascii' or an entry from the result of the 'encoding names' command." + } + } + } + package require punk::ansi + + set hibit_count [expr {$bits-4}] + set bitcolumns [expr {2**$hibit_count}] ;#always 4 bits for the rows - remaining bits for the columns + set columncount [expr {$bitcolumns + 6}] + + + #set header1 [list "" "b7\nb6\nb5" "0\n0\n0" "0\n0\n1" "0\n1\n0" "0\n1\n1" "1\n0\n0" "1\n0\n1" "1\n1\n0" "1\n1\n1"] + set header1 [list] + set hibits_label "" + set indent "" + for {set hb $bits} {$hb > 4} {incr hb -1} { + append hibits_label ${indent}b$hb\n + append indent " " + } + set hibits_label [string range $hibits_label 0 end-1] + lappend header1 $hibits_label "" "" "" "" "" + + for {set colidx 0} {$colidx < $bitcolumns} {incr colidx} { + set binval [format %0${hibit_count}b $colidx] + set binvalbits [split $binval ""] + set indent "" + set display_hibits "" + foreach bb $binvalbits { + append display_hibits $indent$bb\n + append indent " " + } + set display_hibits [string range $display_hibits 0 end-1] + lappend header1 $display_hibits + } + #\u2193 down arrow + #right-down arrows + #\u2ba7 + #\u21b4 + #\u2b0e + set header2 [list " Bits" b4 b3 b2 b1 "column \u2192\nrow \u2b0e" {*}[punk::lib::range 0 $bitcolumns-1]] + set headers [list $header1 $header2] + + #set t [textblock::table -return tableobject -rows $rows] + set t [textblock::table -return tableobject] + #todo - fix textblock::table to allow configure -columncount + for {set c 0} {$c < $columncount} {incr c} { + $t add_column + } + + set vheaders [punk::transpose_equal_lists $headers] + set hidx -1 + foreach vh $vheaders { + incr hidx + $t configure_column $hidx -headers $vh + } + $t configure_header 0 -colspans [list 6 0 0 0 0 0 {*}[lrepeat $bitcolumns 1]] + $t configure_column 0 -blockalign left + + #always 16 rows - remaining bits form the columns + for {set ridx 0} {$ridx <= 15} {incr ridx} { + set charlist [list] + set lowbits [format %04b $ridx] + for {set i 0} {$i < $bitcolumns} {incr i} { + set hibits [format %0${hibit_count}b $i] + set ch [format %c [scan ${hibits}${lowbits} %b]] + #puts "-->${hibits}${lowbits} ch:$ch" + if {$which ne "default"} { + if {[catch {encoding convertfrom $which $ch} ch]} { + set ch [punk::ansi::a red bold]-[punk::ansi::a] + lappend charlist $ch + } else { + lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch] + } + } else { + lappend charlist [punk::ansi::ansistring VIEW -lf 1 -vt 1 $ch] + } + } + set r [list "" {*}[split $lowbits ""] $ridx {*}$charlist] + $t add_row $r + } + puts stderr $t + $t print + } + #just the 7-bit ascii. use [page ascii] for the 8-bit layout proc ascii {} {return { 00 NUL 01 SOH 02 STX 03 ETX 04 EOT 05 ENQ 06 ACK 07 BEL diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/console-0.1.1.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/console-0.1.1.tm index 4322ceaa..ea8d3f77 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/console-0.1.1.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/console-0.1.1.tm @@ -1336,6 +1336,7 @@ namespace eval punk::console { #https://vt100.net/docs/vt510-rm/DA1.html # proc get_device_attributes {{inoutchannels {stdin stdout}}} { + #Note the vt52 rough equivalen \x1bZ - commonly supported but probably best considered obsolete as it collides with ECMA 48 SCI Single Character Introducer #DA1 variable last_da1_result #first element in result is the terminal's architectural class 61,62,63,64.. ? diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm index 6ce76618..46cd5668 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/lib-0.1.2.tm @@ -2242,6 +2242,121 @@ namespace eval punk::lib { } } + + #review - compare to IMAP4 methods of specifying ranges? + punk::args::define { + @id -id ::punk::lib::indexset_resolve + @cmd -name punk::lib::indexset_resolve\ + -summary\ + "Resolve an indexset to a list of integers based on supplied list or string length."\ + -help\ + "Resolve an 'indexset' to a list of actual indices within the range of the provided numitems value. + An indexset consists of a comma delimited list of indexes or index-ranges. + The indexes are 0-based. + Ranges must be specified with .. as the separator. + Whitespace is ignored. + Each index (or endpoint of an index-range) can be of the forms accepted by Tcl list or string commands, + e.g end-2 or 2+2. + + end means the last page. + end-1 means the second last page. + 0.. is the same as 0..end. + examples: + 1,3.. + output the page index 1 (2nd page) followed by all from index 3 to the end. + 0-2,end + output the first 3 pages, and the last page. + end-1..0 + output the indexes in reverse order from 2nd last page to first page." + @values -min 2 -max 2 + numitems -type integer + indexset -type string + } + proc indexset_resolve {numitems indexset} { + if {![string is integer -strict $numitems] || ![regexp {^[\-\+_end,\.0-9]*$} $indexset]} { + #use parser on unhappy path only + set errmsg [punk::args::usage -scheme error ::punk::lib::indexset_resolve] + uplevel 1 [list return -code error -errorcode {TCL WRONGARGS PUNK} $errmsg] + } + set index_list [list] ;#list of actual indexes within the range + set iparts [split $indexset ,] + set index_list [list] + foreach ipart $iparts { + set ipart [string trim $ipart] + set rposn [string first .. $ipart] + if {$rposn>=0} { + #range + lassign [punk::lib::string_splitbefore_indices $ipart $rposn $rposn+2] rawa _ rawb + set rawa [string trim $rawa] + set rawb [string trim $rawb] + if {$rawa eq ""} {set rawa 0} + set a [punk::lib::lindex_resolve $numitems $rawa] + if {$a == -3} { + #undershot - leave negative + } elseif {$a == -2 && $rawa ne "-2"} { + #overshot + set a [expr {$numitems}] ;#put it outside the range on the upper side + } + + if {$rawb eq ""} { + if {$a > $numitems-1} { + set rawb $a ;#make sure .. doesn't return last item - should return nothing + } else { + set rawb end + } + } + set b [punk::lib::lindex_resolve $numitems $rawb] + if {$b == -3} { + #undershot - leave negative + } elseif {$b == -2 && $rawb ne "-2"} { + set b [expr {$numitems}] ;#overshot - put it outside the range on the upper side + } + + #e.g make sure .. doesn't return last item - should return nothing as both are above the range. + if {$a >= 0 && $a <= $numitems-1 && $b >=0 && $b <= $numitems-1} { + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } else { + if {$a >= 0 && $a <= $numitems-1} { + #only a is in the range + if {$b < 0} { + set b 0 + } else { + set b [expr {$numitems-1}] + } + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } elseif {$b >=0 && $b <= $numitems-1} { + #only b is in the range + if {$a < 0} { + set a 0 + } else { + set a [expr {$numitems-1}] + } + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } else { + #both outside the range + if {$a < 0 && $b > 0} { + #spans the range in forward order + set a 0 + set b [expr {$numitems-1}] + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } elseif {$a > 0 && $b < 0} { + #spans the range in reverse order + set a [expr {$numitems-1}] + set b 0 + lappend index_list {*}[punk::lib::range $a $b] ;#required for tcl8.6, on tcl9 this will call lseq internally. + } + #both outside of range on same side + } + } + } else { + set idx [punk::lib::lindex_resolve_basic $numitems $ipart] + if {$idx >= 0} { + lappend index_list $idx + } + } + } + return $index_list + } # showdict uses lindex_resolve results -2 & -3 to determine whether index is out of bounds on upper vs lower side #REVIEW: This shouldn't really need the list itself - just the length would suffice punk::args::define { @@ -2305,7 +2420,8 @@ namespace eval punk::lib { #<0 ? error "lindex_resolve len must be an integer" } - set index [tcl::string::map {_ {}} $index] ;#forward compatibility with integers such as 1_000 + set index [tcl::string::map {_ {}} $index] ;#basic forward compatibility with integers such as 1_000 for 8.6 + #todo - be stricter about malformations such as 1000_ if {[string is integer -strict $index]} { #can match +i -i if {$index < 0} { @@ -3345,8 +3461,12 @@ namespace eval punk::lib { #NOTE: running ta::detect on a list (or dict) as a whole can be problematic if items in the list have backslash escapes due to Tcl list quoting and escaping behaviour. #This commonly happens if there is an unbalanced brace (which is a normal occurrence and needs to be handled) #ta::detect on a list of ansi-containing string may appear to work for some simple inputs but is not reliable - #detect_in_list will check at first level. (not intended for detecting ansi in deeper structures) - if {![punk::ansi::ta::detect_in_list $linelist]} { + #detect_in_list/detectcode_in_list will check at first level. (not intended for detecting ansi in deeper structures) + + #we use detectcode_in_list instead of detect_in_list + #detectcode_in_list will detect unclosed (or unopened) paired sequences such as PM (privacy message) + # - but the main reason is it is slightly faster. + if {![punk::ansi::ta::detectcode_in_list $linelist]} { if {$opt_ansiresets} { foreach ln $linelist { lappend transformed $RST$ln$RST diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm index f8e55b02..10fda84e 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/punk/ns-0.1.0.tm @@ -13,6 +13,9 @@ # @@ Meta End +#BUGS +# 2025-08 +# n// and n/// won't output info about 'namespace path' if there are no commands in the namespace # ++ +++ +++ +++ +++ +++ +++ +++ +++ +++ +++ ## Requirements @@ -1235,7 +1238,7 @@ tcl::namespace::eval punk::ns { if {[dict size [dict get $nsdict namespacepath]]} { set path_text "" if {!$opt_nspathcommands} { - append path_text \n " Also resolving cmds in namespace paths: [dict keys [dict get $nsdict namespacepath]]" + append path_text \n " Also resolving cmds in namespace paths: [dict keys [dict get $nsdict namespacepath]] (use n/// to display)" } else { append path_text \n " Also resolving cmds in namespace paths:" set nspathdict [dict get $nsdict namespacepath] @@ -1247,8 +1250,17 @@ tcl::namespace::eval punk::ns { } } else { #todo - change to display in column order to be same as main command listing + set parentcommands [dict get $nsdict commands] dict for {k v} $nspathdict { - set pathcommands [dict get $v commands] + set rawpathcommands [dict get $v commands] + set pathcommands [list] + foreach c $rawpathcommands { + if {$c in $parentcommands} { + lappend pathcommands [punk::ansi::a strike]$c[a] + } else { + lappend pathcommands $c + } + } set columns 6 if {[llength $pathcommands] < 6} { set columns [llength $v] diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/shellfilter-0.2.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/shellfilter-0.2.tm index e04e5107..8017d3f5 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/shellfilter-0.2.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/shellfilter-0.2.tm @@ -579,13 +579,15 @@ namespace eval shellfilter::chan { #punk::ansi::ansistrip converts at least some of the box drawing G0 chars to unicode - todo - more complete conversion #assumes line-buffering. a more advanced filter required if ansicodes can arrive split across separate read or write operations! oo::class create ansistrip { - variable o_trecord + variable o_trecord variable o_enc + variable o_encbuf variable o_is_junction constructor {tf} { package require punk::ansi set o_trecord $tf set o_enc [dict get $tf -encoding] + set o_encbuf "" if {[dict exists $tf -junction]} { set o_is_junction [dict get $tf -junction] } else { @@ -614,10 +616,36 @@ namespace eval shellfilter::chan { method flush {transform_handle} { return "" } + #method write {transform_handle bytes} { + # #broken due to occasional unexpected byte sequence + # set instring [encoding convertfrom $o_enc $bytes] + # set outstring [punk::ansi::ansistrip $instring] + # return [encoding convertto $o_enc $outstring] + #} method write {transform_handle bytes} { - set instring [encoding convertfrom $o_enc $bytes] - set outstring [punk::ansi::ansistrip $instring] - return [encoding convertto $o_enc $outstring] + #set instring [tcl::encoding::convertfrom $o_enc $bytes] ;naive approach will break due to unexpected byte sequence - occasionally + #bytes can break at arbitrary points making encoding conversions invalid. + + set inputbytes $o_encbuf$bytes + set o_encbuf "" + set tail_offset 0 + while {$tail_offset < [string length $inputbytes] && [catch {tcl::encoding::convertfrom $o_enc [string range $inputbytes 0 end-$tail_offset]} stringdata]} { + incr tail_offset + } + if {$tail_offset > 0} { + if {$tail_offset < [string length $inputbytes]} { + #stringdata from catch statement must be a valid result + set t [expr {$tail_offset - 1}] + set o_encbuf [string range $inputbytes end-$t end] + } else { + set stringdata "" + set o_encbuf $inputbytes + return "" + } + } + + set outstring [punk::ansi::ansistrip $stringdata] + return [tcl::encoding::convertto $o_enc $outstring] } method meta_is_redirection {} { return $o_is_junction @@ -724,6 +752,8 @@ namespace eval shellfilter::chan { set emit "" if {[string last \x1b $buf] >= 0} { #detect will detect ansi SGR and gron groff and other codes + #REVIEW - ta::detect won't detect SOS without paired ST for things like PM + # ta::detectcode will - but then split_codes_single will treat unpaired SOS as text? if {[punk::ansi::ta::detect $buf]} { #split_codes_single regex faster than split_codes - but more resulting parts #'single' refers to number of escapes - but can still contain e.g multiple SGR codes (or mode set operations etc) @@ -1035,7 +1065,7 @@ namespace eval shellfilter::chan { } #todo - something oo::class create rebuffer { - variable o_trecord + variable o_trecord variable o_enc constructor {tf} { set o_trecord $tf diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm index de0164cd..97969463 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/bootsupport/modules/textblock-0.1.3.tm @@ -2528,7 +2528,7 @@ tcl::namespace::eval textblock { set ansiborder_final $ansiborder_body_col_row #$c will always have ansi resets due to overtype behaviour ? #todo - review overtype - if {[punk::ansi::ta::detect $c]} { + if {[punk::ansi::ta::detectcode $c]} { #use only the last ansi sequence in the cell value #Filter out foreground and use background for ansiborder override set parts [punk::ansi::ta::split_codes_single $c] @@ -4377,6 +4377,7 @@ tcl::namespace::eval textblock { if {![punk::ansi::ta::detect $block]} { return $block } + #could be newine in SOS/PM? review - terminal should theoretically ignore all chars til close of string foreach ln [split $block \n] { set parts [punk::ansi::ta::split_codes $ln] if {[lindex $parts 0] eq ""} { @@ -4894,7 +4895,9 @@ tcl::namespace::eval textblock { } set textblock [textutil::tabify::untabify2 $textblock $tw] } - if {[string length $textblock] > 1 && [punk::ansi::ta::detect $textblock]} { + #review detectcode vs detect. detectcode won't look for completness of all codes to give a positive result + #ansistripraw splits on complete codes.. + if {[string length $textblock] > 1 && [punk::ansi::ta::detectcode $textblock]} { #ansistripraw slightly faster than ansistrip - and won't affect width (avoid detect_g0/conversions) set textblock [punk::ansi::ansistripraw $textblock] } @@ -4917,7 +4920,9 @@ tcl::namespace::eval textblock { } else { set tl $textblock } - if {[punk::ansi::ta::detect $tl]} { + #review detectcode vs detect. detectcode won't look for completness of all codes to give a positive result + #ansistripraw splits on complete codes.. + if {[punk::ansi::ta::detectcode $tl]} { set tl [punk::ansi::ansistripraw $tl] } return [punk::char::ansifreestring_width $tl] @@ -4964,7 +4969,7 @@ tcl::namespace::eval textblock { set textblock [textutil::tabify::untabify2 $textblock $tw] } #ansistripraw on entire block in one go rather than line by line - result should be the same - review - make tests - if {[string length $textblock] > 1 && [punk::ansi::ta::detect $textblock]} { + if {[string length $textblock] > 1 && [punk::ansi::ta::detectcode $textblock]} { set textblock [punk::ansi::ansistripraw $textblock] } if {[tcl::string::last \n $textblock] >= 0} { @@ -5211,7 +5216,7 @@ tcl::namespace::eval textblock { set known_hasansi [tcl::dict::get $opts -known_hasansi] if {$known_hasansi eq ""} { - set block_has_ansi [punk::ansi::ta::detect $block] + set block_has_ansi [punk::ansi::ta::detectcode $block] } else { set block_has_ansi $known_hasansi } @@ -5648,7 +5653,7 @@ tcl::namespace::eval textblock { set rowcount 0 set blocklists [list] foreach b $blocks { - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { #-ansireplays 1 quite expensive e.g 7ms in 2024 set bl [punk::lib::lines_as_list -ansireplays 1 -ansiresets $ansiresets -- $b] } else { @@ -5676,7 +5681,7 @@ tcl::namespace::eval textblock { set i -1 foreach b $args { incr i - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { #-ansireplays 1 quite expensive e.g 7ms in 2024 set blines [punk::lib::lines_as_list -ansireplays 1 -ansiresets auto -- $b] } else { @@ -5799,7 +5804,7 @@ tcl::namespace::eval textblock { #testing of any decent size block line by line - even with max_string_length_line is slower than ta::detect anyway. #blocks passed to join can be ragged - so we can't pass -known_samewidth to pad - if {[punk::ansi::ta::detect $b]} { + if {[punk::ansi::ta::detectcode $b]} { # - we need to join to use pad - even though we then need to immediately resplit REVIEW (make line list version of pad?) set replay_block [::join [punk::lib::lines_as_list -ansireplays 1 -ansiresets $ansiresets -- $b] \n] #set blines [split [textblock::pad $replay_block -known_hasansi 1 -known_samewidth $is_samewidth -known_blockwidth $bwidth -width $bwidth -which right -padchar " "] \n] @@ -8650,7 +8655,7 @@ tcl::namespace::eval textblock { set cache_inner [overtype::block -blockalign $opt_blockalign -ellipsis $opt_ellipsis $underlay $cache_contentpattern] #puts "frame--->ansiwrap -rawansi [ansistring VIEW $opt_ansibase] $cache_inner" if {$opt_ansibase ne ""} { - if {[punk::ansi::ta::detect $cache_inner]} { + if {[punk::ansi::ta::detectcode $cache_inner]} { #set cache_inner [punk::ansi::ansiwrap -rawansi $opt_ansibase $cache_inner] #set cache_inner [punk::ansi::ansiwrap_raw $opt_ansibase "" "" $cache_inner] #jjj ??? review @@ -8745,7 +8750,7 @@ tcl::namespace::eval textblock { #set cwidth [textblock::width $contents] #JJJ - set contents_has_ansi [punk::ansi::ta::detect $contents] + set contents_has_ansi [punk::ansi::ta::detectcode $contents] if {$opt_ansibase ne ""} { if {$contents_has_ansi} { #set contents [punk::ansi::ansiwrap -rawansi $opt_ansibase $contents] @@ -8799,8 +8804,9 @@ tcl::namespace::eval textblock { if {$subposn >= 0} { set content_line [lindex $clines $contentindex] #review - different forms of reset e.g \x1b\[m ?? - if {[string range $content_line 0 3] eq "\x1b\[0m"} { + if {[tcl::string::range $content_line 0 3] eq "\x1b\[0m"} { set content_line [tcl::string::range $content_line 4 end] + #::tcl::string::replace content_line 0 3 } append content_line $opt_ansibase append fs [tcl::string::replace $tline $subposn $subposn+$pattern_offset $content_line] \n diff --git a/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl b/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl index 8cb8153c..c3cbc9aa 100644 --- a/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl +++ b/src/project_layouts/custom/_project/punk.shell-0.1/src/make.tcl @@ -3075,7 +3075,7 @@ foreach vfstail $vfs_tails { #copy the version that is mounted in this runtime to vfsname.new if {[catch { file copy -force $building_runtime $buildfolder/$vfsname.new - catch {exec chmod +x $buildfolder/$vfsname.new} + catch {exec chmod +w $buildfolder/$vfsname.new} } errM]} { puts stderr "$kit_type 'file copy -force $building_runtime $buildfolder/$vfsname.new' failed\n$errM" error $errM diff --git a/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm b/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm index fc438d57..a6224c0d 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/args-0.2.tm @@ -1364,6 +1364,11 @@ tcl::namespace::eval punk::args { #} tcl::dict::set tmp_optspec_defaults -type $v } + -defaultdisplaytype { + #how the -default is displayed + #-default doesn't have to be the same type as -type which validates user input that is not defaulted. + tcl::dict::set tmp_optspec_defaults -defaultdisplaytype $v + } -parsekey { tcl::dict::set tmp_optspec_defaults -parsekey $v @@ -1441,7 +1446,7 @@ tcl::namespace::eval punk::args { default { set known { -parsekey -group -grouphelp\ -any -anyopts -arbitrary -form -minsize -maxsize -choices -choicegroups -choicemultiple -choicecolumns -choicelabels -choiceinfo\ - -type -range -typeranges -default -typedefaults + -type -range -typeranges -default -defaultdisplaytype -typedefaults -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted -nocase\ -nominsize -nomaxsize -norange -nochoices -nochoicelabels\ -type -optional -allow_ansi -validate_ansistripped -strip_ansi -multiple -prefix\ @@ -2052,6 +2057,9 @@ tcl::namespace::eval punk::args { tcl::dict::set spec_merged -optional 1 } } + -defaultdisplaytype { + tcl::dict::set spec_merged -defaultdisplaytype $specval + } -typedefaults { set typecount [llength [tcl::dict::get $spec_merged -type]] if {$typecount != [llength $specval]} { @@ -2114,7 +2122,7 @@ tcl::namespace::eval punk::args { -form -type\ -parsekey -group\ -range -typeranges\ - -default -typedefaults\ + -default -defaultdisplaytype -typedefaults\ -minsize -maxsize -choices -choicegroups\ -choicemultiple -choicecolumns -choiceprefix -choiceprefixdenylist -choiceprefixreservelist -choicerestricted\ -nocase -optional -multiple -validate_ansistripped -allow_ansi -strip_ansi -help\ @@ -3784,7 +3792,32 @@ tcl::namespace::eval punk::args { } if {[dict exists $arginfo -default]} { - set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + #default isn't necessarily of same type as -type required for validation + #Guessing at the type from the data is likely to be unsatisfactory. + + set defaultdisplaytype [Dict_getdef $arginfo -defaultdisplaytype string] + switch -- $defaultdisplaytype { + dict { + #single level + set rawdefault [dict get $arginfo -default] + set default "{\n" + dict for {k v} $rawdefault { + append default " \"$k\" \"$v\"\n" + } + append default "}" + } + list { + set default "{\n" + foreach v $rawdefault { + append default " \"$v\"\n" + } + append default "}" + } + default { + #set default "'$A_DEFAULT[dict get $arginfo -default]$RST'" + set default "'[dict get $arginfo -default]'" + } + } } else { set default "" } diff --git a/src/vfs/_vfscommon.vfs/modules/punk/pdf-0.1.0.tm b/src/vfs/_vfscommon.vfs/modules/punk/pdf-0.1.0.tm index 4fde460e..15767ef5 100644 --- a/src/vfs/_vfscommon.vfs/modules/punk/pdf-0.1.0.tm +++ b/src/vfs/_vfscommon.vfs/modules/punk/pdf-0.1.0.tm @@ -82,70 +82,126 @@ namespace eval ::punk::pdf { variable base_x_per_c 4 ;#fixed width output font's x_per_c. variable cfg_blocksep_default "#pdf::text:blockstart type %type% pageindex %pageindex% blockindex %blockindex% y-index %y-index% tlc {%x0% %y0%} brc {%brc%} warnings %warnings% : %marker%" variable cfg_pagesep_default "#pdf::text:pagestart index %index% pageblocks %pageblocks% textblocks %textblocks% imageblocks %imageblocks% : [a bold green]PAGE[a]" + variable cfg_header_default + set cfg_header_default \ +"#pdf::text::header doc %doc% pages %pages% size %size% sha1 %sha1% punk::pdf %parserversion% +# args +# %args%" + + variable callid 0 variable results + variable test array set results {} punk::args::define { @id -id ::punk::pdf::text - @cmd -name punk::pdf::text -help\ + @cmd -name punk::pdf::text\ + -summary\ + "Extract text from non-image parts of a PDF"\ + -help\ "Extract text lines from a pdf - (No OCR - will only retrieve actual text contents)" + (No OCR - will only retrieve actual text contents) + The text function can only operate on the text spans provided by the underlying + engine (MuPDF) - which in some cases (for example spaces adjacent to brackets in + PDF32000_2008.pdf p24 eg4) does not interpret the number of whitespace characters + as a human reader would. The text data supplied for formulas such as in 7.10.1 of + the above document can be missing symbols such as the square-root symbol. This would + have to be converted to a unicode square root symbol with additional bracketing to + maintain meaning. The positioning of subscript numerals would also need to be + processed - and without font size information from tclMuPDF - this is currently + not practical to process reasonably. Such formulas will come out mangled. + " @leaders -min 0 -max 0 @opts -min 0 -max 2 -p|-page_indexes -parsekey -page_indexes -type string -default "0.." -help\ "Comma delimited list of indexes and/or ranges specifying which pages to output. - The indexes are 0-based. - Ranges must be specified with .. as the separator. - end means the last page. - end-1 means the second last page. - 0.. is the same as 0..end. - examples: - 1,3.. - output the page index 1 (2nd page) followed by all from index 3 to the end. - 0-2,end - output the first 3 pages, and the last page. - end-1..0 - output the pages in reverse order from 2nd last page to first page." + The indexes are 0-based. + Ranges must be specified with .. as the separator. + end means the last page. + end-1 means the second last page. + 0.. is the same as .. and is the same as 0..end + examples: + 1,3.. + output the page index 1 (2nd page) followed by all from index 3 to the end. + 0..2,end + output the first 3 pages, and the last page. + end-1..0 + output the pages in reverse order from 2nd last page to first page." -b|-block_indexes -parsekey -block_indexes -type string -default "0.." -help\ "Comma delimited list of indexes and/or ranges specifying which blocks to output. - Format is as per -page_indexes" - -merge_yblocks -default true -help\ - "Whether to merge blocks vertically when they have the same starting y-index. - This usually makes the output more in line with the source document - but - in some scenarios it might be desired to keep the blocks listed vertically - even though they appear side by side in the source." + Format is as per -page_indexes" + -merge_yblocks -default false -help\ + "Whether to merge blocks vertically when they interleave in the y direction. + This usually makes the output more in line with the source document - but + in some scenarios it might be desired to keep the blocks listed vertically + even though they appear side by side in the source. Such vertical listing as + given by default when -merge_yblocks is false, does not necessarily match the + left to right order. + Note that especially when there is small-font text with other text to the right, + The small-font text that is not rightmost may be truncated (with trailing ellipsis) + to fit it into the vertically merged layout. Such truncation produces warnings on + the -errchannel and increments the warning count visible in the -sep-block line + preceding the merged blocks. + Smaller print that doesn't fit in the layout is sometimes not required to extract + the main data from a document such as an invoice - and the merged data can make + other elements more amenable to parsing. In practice, 2 passes with -merge_yblocks + configured on and off, combined with -sep_block data and a set of custom rules + based on the specific document set may be required for information extraction." + #------------------------------------------------------------------------------------ -compact -type boolean -default false -help\ "If compact is false - blank lines approximating the vertical spacing in the source document will be emitted between each block. If -merged_yblocks is true - there will still be potential vertical whitespace within any merged blocks, as the point of -merge_yblocks is to align elements - closely to that within the source document, but there will be no vertical layout - spacing between such blocks, or other non-merging blocks." - -image_place_ansi -type dict -default {border Term-grey78 inner Term-grey58 small Term-grey15} -help\ - "3-element list of ANSI colour codes for the textual image placeholders. + closely to that within the source document, but there will be no vertical + layout spacing between such merged blocks, or other non-merging blocks. + This setting is redundant if -postcompact is true." + -postcompact -type boolean -default false -help\ + "Strip out vertical spacing (empty or whitespace-only lines) from all blocks. + This sets -compact to true, and removes vertical gaps from any merged blocks. + The most vertically compact representation of the text will result from setting + -merge true, -postcompact true and -image_place_ansi to the empty string, as + well as omitting (or setting to empty string) -sep_page and -sep_block. + Note that if the source PDF actually positioned blank lines as strings of + whitespace, these too would be removed. + If -image_place_ansi is left as the default or has other ansi codes, the + image placeholders will not be stripped, even if they consist of the default + space characters. + If -sep_block or -sep_page are set to one or more spaces - these blank lines + will still be emitted." + -shrink_textfree_blocks -type boolean -default true -help\ + "If a block is image only - don't output the placeholders. + These placeholders will still be emitted if -merge_yblocks is true and they + are in a merged block that has also merged text blocks. Set -image_place_ansi + to the empty string to suppress ANSI background for all image placeholders." + -image_place_ansi -type dict -defaultdisplaytype dict -default {border Term-grey78 inner Term-grey58 small Term-grey15} -help\ + "Dict of ANSI colour codes for the textual image placeholders. Colour codes are those as listed by 'punk::ansi::a?' - If the default -image_place_chars values of the space character are being used, the colour - names should begin with a capital letter (indicating background as opposed to foreground colour). + If the default -image_place_chars values of the space character are being used, + the colour names should begin with a capital letter (indicating background as + opposed to foreground colour). The placeholders are by default space characters with the ANSI applied. - This means that just by stripping ANSI from the output, layout is maintained, and the - text can be processed simply, or the ANSI can be retained if your text parser wishes - to use it to make decisions. - The colour at element 'border' is for the border colour, 'inner' for the area within the border, - and the colour at element 'small' is for images too small for border representation. The border - bears no relation to the underlying image - and is merely to aid in seeing when images overlap - or are cropped etc. The dimensions of the placeholder are approximates only to the source - images, as they are in multiples of the fixed-width output font character width and - line height. - Setting -image_place_ansi to an empty value, or -image_place_chars to an empty value - will stop ANSI from being output at the image locations." - -image_place_chars -type dict -default {tlc " " hlt " " trc " " vlr " " brc " " hlb " " blc " " vll " " inner " " small " "} -help\ - "Characters to use for image placeholders. The default list of empty spaces is recommended - for ease of visualisation and output parsing assuming an function such as punk::ansi::ansistrip - is available. An alternative set of characters such as a unicode block character might be useful - alongside turning ANSI off by setting -image_place_ansi to empty; if the unicode character is known - to be outside the domain of characters in the source document, and it is desired to use such chars - to detect approximate image position in relation to text. + This means that just by stripping ANSI from the output, layout is maintained, + and the text can be processed simply, or the ANSI can be retained if your text + parser wishes to use it to make decisions. + The colour at element 'border' is for the border colour, 'inner' for the area + within the border, and the colour at element 'small' is for images too small for + border representation. The border bears no relation to the underlying image - and + is merely to aid in seeing when images overlap or are cropped etc. The dimensions + of the placeholder are approximates only to the source images, as they are in + multiples of the fixed-width output font character width and line height. + Setting -image_place_ansi to an empty value, or -image_place_chars to an empty + value will stop ANSI from being output at the image locations." + #------------------------------------------------------------------------------------ + -image_place_chars -type dict -defaultdisplaytype dict -default {tlc " " hlt " " trc " " vlr " " brc " " hlb " " blc " " vll " " inner " " small " "} -help\ + "Characters to use for image placeholders. The default list of empty spaces is + recommended for ease of visualisation and output parsing assuming an function + such as punk::ansi::ansistrip is available. An alternative set of characters + such as a unicode block character might be useful alongside turning ANSI off by + setting -image_place_ansi to empty; if the unicode character is known to be + outside the domain of characters in the source document, and it is desired to + use such chars to detect approximate image position in relation to text. tlc - border top left corner hlt - border top horizontal lines trc - border top right corner @@ -157,40 +213,51 @@ namespace eval ::punk::pdf { inner - image area within border small - for images too small for borders " + -header -type string -default "" -help\ + "Header line to record details of the conversion run. + Set to 'default' to use the provided default template. + Substitutions available: + %doc% - full path of source pdf + %pages% - number of pages in the pdf + %size% - size in bytes of the input pdf file + %sha1% - sha1 checksum of the input pdf file + %parserversion% - version of punk::pdf used. + %args% - arguments supplied to text command. + %default% - the default header template - for extending." -sep_page -type string -default "" -help\ "Send a line of output to stdout at the beginning of each - page in the PDF. - If set to the value: default - a default separator with some added info will be used. - line begins with: #pdf::text:pagestart - Substitutions available: - %index% 0-based index of the page (not the page number as in the PDF) - %pageblocks% total number of blocks in the page, includes both - text and image blocks. - %textblocks% number of text blocks in the page - %imageblocks% number of image blocks in the page - %nl% newline - %default% The default page sep - to allow extending." + page in the PDF. + If set to the value: default + a default separator with some added info will be used. + line begins with: #pdf::text:pagestart + Substitutions available: + %index% 0-based index of the page (not the page number as in the PDF) + %pageblocks% total number of blocks in the page, includes both + text and image blocks. + %textblocks% number of text blocks in the page + %imageblocks% number of image blocks in the page + %nl% newline + %default% The default page sep - to allow extending." -sep_block -type string -default "" -help\ "Send a line of output to stdout at the beginning of each - textblock in the PDF. This can aid in parsing. - If set to the the value: default - a default separator with some added info will be used. - line begins with: #pdf::text:blockstart - Substitutions available: - %type% type of block text or image - %pageindex% 0-based index of the page (not the page number as in the PDF) - %y-index% 0-based index of block after the textblock list has - been sorted by y0 - %x0% first x coordinate of the block - %y0% first y coordinate of the block - %x1% second x coordinate of the block - %y1% second y coordinate of the block - %nl% newline - %warnings% Number of warnings emitted to stderr for the block. - These are normally for text overlay attempts, which should - only affect whitespace, but should be checked. - %default% The default blocksep - to allow extending." + textblock in the PDF. This can aid in parsing. + If set to the the value: default + a default separator with some added info will be used. + line begins with: #pdf::text:blockstart + Substitutions available: + %type% type of block text or image + %pageindex% 0-based index of the page (not the page number as in the PDF) + %y-index% 0-based index of block after the textblock list has + been sorted by y0 + %x0% first x coordinate of the block + %y0% first y coordinate of the block + %x1% second x coordinate of the block + %y1% second y coordinate of the block + %nl% newline + %warnings% Number of warnings emitted to stderr for the block. + These are normally for text overlay attempts, which should + only affect whitespace, but should be checked. + %default% The default blocksep - to allow extending." -outchannel -type string -default stdout -choicerestricted 0 -choices {return null stderr stdout}\ -choicelabels { return\ @@ -215,33 +282,33 @@ namespace eval ::punk::pdf { any open Tcl channel can be used." -debug_pageblock -default "" -type list -minsize 2 -maxsize 2 -help\ "A 2-element list of the page then block index(s) for which to output - extra information on stderr. - Supplied value must contain 2 elements {indexset indexset} - The membership selection is performed the same way as -page_indexes. - e.g {.. end-1} means second-last block on every page." + extra information on stderr. + Supplied value if not empty, must contain 2 elements {indexset indexset} + The membership selection is performed the same way as -page_indexes. + e.g {.. end-1} means second-last block on every page." -debug_highlight -default "red bold" -help\ "ANSI codes as understood by punk::ansi (see punk::ansi::a?) - These are used to colourise output if -debug_pageblock has been set" + These are used to colourise output if -debug_pageblock has been set" -highlight_overwrites -default "bold" -help\ "Ansi codes as understood by punk::ansi (see punk::ansi::a?) - These are used to mark text elements which have been overlayed with - the same text data. This may occur for example in table rows in the source - document that have alternating colour or emphasis. - Having this ANSI applied can be useful when parsing the output to aid in - grouping lines. e.g by using [punk::ansi::ta::detect ] or similar. - - By applying ansi, the layout is not disturbed when viewing in a terminal, - and may be more visually similar to the source document. - Saved output can always be passed through an ansistrip function if needed, - or the default -highlight_overwrites can be overridden by supplying an empty - string." - + These are used to mark text elements which have been overlayed with + the same text data. This may occur for example in table rows in the source + document that have alternating colour or emphasis. + Having this ANSI applied can be useful when parsing the output to aid in + grouping lines. e.g by using [punk::ansi::ta::detect ] or similar. + + By applying ansi, the layout is not disturbed when viewing in a terminal, + and may be more visually similar to the source document. + Saved output can always be passed through an ansistrip function if needed, + or the default -highlight_overwrites can be overridden by supplying an empty + string." + -engine -type any -default MuPDF -choices {MuPDF} -help\ "Only MuPDF is currently implemented. Be aware that while this script and tclMuPDF - are MIT/BSD licensed, the underlying MuPDF library is AGPL - which could place - some restrictions on commercial use." + are MIT/BSD licensed, the underlying MuPDF library is AGPL - which could place + some restrictions on commercial use." -warnings_engine -default 0 -choices {0 1 2}\ -choicelabels { 0\ @@ -253,7 +320,8 @@ namespace eval ::punk::pdf { }\ -help\ "Whether to display internal warnings regarding the state of the PDF document. - The underlying MuPDF library can in many cases work with bad/corrupted PDFS." + The underlying MuPDF library can in many cases work with bad/corrupted PDFS, + and if so, may emit warnings." -warnings_textblock -default 2 -type integer -range {0 9} -help\ "0 to disable. 1 for least info, 9 for most. @@ -263,12 +331,20 @@ namespace eval ::punk::pdf { "Path to the .pdf file to parse" } proc text {args} { + variable callid incr callid + + package require tcl::chan::variable + #variable testchan + #set testchan [::tcl::chan::variable ::punk::pdf::test($callid)] + #chan configure $testchan -translation lf + variable results variable base_x_per_c variable cfg_blocksep_default variable cfg_pagesep_default + variable cfg_header_default set x_per_line [expr {$base_x_per_c * 2}] ;#hack for now set argd [punk::args::parse $args withid ::punk::pdf::text] @@ -282,6 +358,10 @@ namespace eval ::punk::pdf { if {$opt_blocksep eq "default"} { set opt_blocksep $cfg_blocksep_default } + set opt_header [dict get $opts -header] + if {$opt_header eq "default"} { + set opt_header $cfg_header_default + } set debug_pageblock [dict get $opts -debug_pageblock] set debug_highlight [string trim [dict get $opts -debug_highlight]] set highlight_overwrites [string trim [dict get $opts -highlight_overwrites]] @@ -289,8 +369,14 @@ namespace eval ::punk::pdf { set block_indexes [string trim [dict get $opts -block_indexes]] set warnings_engine [dict get $opts -warnings_engine] set opt_compact [dict get $opts -compact] + set opt_postcompact [dict get $opts -postcompact] + if {$opt_postcompact} { + set opt_compact 1 ;#don't emit inter-block spacing in the first place + #will also strip vertical space merged blocks + } set opt_image_place_ansi [dict get $opts -image_place_ansi] set opt_image_place_chars [dict get $opts -image_place_chars] + set opt_shrink_textfree_blocks [dict get $opts -shrink_textfree_blocks] if {$warnings_engine == 2} { #turn on printwarnings before we start ::mupdf::printwarnings 1 @@ -302,6 +388,8 @@ namespace eval ::punk::pdf { return { package require tcl::chan::variable set outchan [::tcl::chan::variable ::punk::pdf::results($callid)] + #chan configure $outchan -translation binary -encoding utf-8 + chan configure $outchan -translation lf -encoding utf-8 } null { package require tcl::chan::null @@ -320,7 +408,8 @@ namespace eval ::punk::pdf { } else { set errchan [::tcl::chan::variable ::punk::pdf::results($callid)] } - chan configure $errchan -buffering none + #chan configure $errchan -buffering none -translation binary + chan configure $errchan -buffering none -translation lf -encoding utf-8 } null { package require tcl::chan::null @@ -347,6 +436,13 @@ namespace eval ::punk::pdf { set d [::mupdf::open $fname] set npages [$d npages] + if {$opt_header ne ""} { + set opt_header [string map [list %default% $cfg_header_default] $opt_header] + package require sha1 + set hash [sha1::sha1 -hex -file $fname] + set map [list %nl% \n %crlf% \r\n %doc% $fname %pages% $npages %size% [file size $fname] %sha1% $hash %args% $args %parserversion% [package present punk::pdf]] + puts $outchan [string map $map $opt_header] + } set page_index_list [punk::lib::indexset_resolve $npages $page_indexes] set debug_pageblock_pages [punk::lib::indexset_resolve $npages [lindex $debug_pageblock 0]] @@ -495,21 +591,33 @@ namespace eval ::punk::pdf { # } #} #------------ - set cur_merge_base_fields [dict get [lindex $MERGE_BUFFER 0 0] header fields] - if {$bny0 >= [dict get $cur_merge_base_fields y1]} { - #this block begins below the entire MERGE_BUFFER - time to render it - _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $warnings_textblock $outchan $errchan - if {!$opt_compact} { - #default: -compact false - set vdist [expr {$bny0 - [dict get $cur_merge_base_fields y1]}] - set vdistlines [expr {int(ceil($vdist / $x_per_line))}] - if {$vdistlines > 0} { - set vseparator [string repeat \n $vdistlines] - puts -nonewline $outchan $vseparator + #puts "MB dimensions: [punk::pdf::system::merge_buffer_dimensions MERGE_BUFFER]" + set mbdimensions [punk::pdf::system::merge_buffer_dimensions MERGE_BUFFER] ;#list x0 y0 x1 y1 + #set cur_merge_base_fields [dict get [lindex $MERGE_BUFFER 0 0] header fields] + #if {$bny0 >= [dict get $cur_merge_base_fields y1]} {} + if {$bny0 >= [lindex $mbdimensions 3]} { + + #this block begins below the entire MERGE_BUFFER - time to render previous block + _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $opt_shrink_textfree_blocks $opt_postcompact $warnings_textblock $outchan $errchan + + if {$tbnum in $block_index_list} { + #now this block's offset from the top + if {!$opt_compact} { + #default: -compact false + #set vdist [expr {$bny0 - [dict get $cur_merge_base_fields y1]}] + set vdist [expr {$bny0 - [lindex $mbdimensions 3]}] + #set vdistlines [expr {int(ceil($vdist / $x_per_line))}] + set vdistlines [expr {int(floor($vdist / $x_per_line))}] + if {$vdistlines > 0} { + set vseparator [string repeat \n $vdistlines] + puts -nonewline $outchan $vseparator + } } } } else { - lappend MERGE_BUFFER [list] + if {$tbnum in $block_index_list} { + lappend MERGE_BUFFER [list] + } } } @@ -577,9 +685,17 @@ namespace eval ::punk::pdf { set vertx [expr {$bny1 - $bny0}] set img_char_height [expr {int(ceil($vertx / $x_per_line))}] ;#linecount - #puts "-- imgstart_char_pos:$imgstart_char_pos img_char_width:$img_char_width" + #puts "-- img_char_height:$img_char_height img_char:width: $img_char_width" + if {$img_char_height == 0} { + puts $errchan "WARNING type debug msg {zero height image}" ;#todo + incr block_warning_count ;#0 based + #nothing to render for a zero height img + #MERGE_BUFFER will have header + lset MERGE_BUFFER end end+1 [list warnings $block_warning_count] + continue + } - #put a border around the image if it's big enough + #put a border around the image if it's big enough #if it can't be bordered use a different colour #this gives us 3 greys applied to space elements, to aid in seeing image overlaps and positioning. #all can be easily stripped out with ansistrip to leave only spaces for text processing @@ -629,14 +745,24 @@ namespace eval ::punk::pdf { set a_inner [dict get $opt_image_place_ansi inner] set b [textblock::frame -type [list tlc $c_tlc hlt $c_hlt trc $c_trc vlr $c_vlr brc $c_brc hlb $c_hlb blc $c_blc vll $c_vll] -ansiborder [a $a_border] [punk::ansi::ansiwrap $a_inner $inner]] } - } + } #set b [textblock::block $img_char_width $img_char_height " "] #set b [punk::ansi::ansiwrap Term-102 $b] set img_indent [textblock::block [expr {max(0,$imgstart_char_pos)}] $img_char_height \uFFFD] if {$imgstart_char_pos < 0} { #img representation is effectively cropped at left #img_indent will just be a list of newlines to match height + + #even if img_indent and b have no ansi - result of renderspace will have ansi resets (todo review overtype::renderspace) + set had_ansi 0 + if {[punk::ansi::ta::detect $b]} { + set had_ansi 1 + } set positioned_block [overtype::renderspace -insert_mode 0 -expand_right 1 -startcolumn $imgstart_char_pos $img_indent $b] + if {!$had_ansi} { + #if user disabled ansi on image_placeholders - we need to ensure it still has none after renderspace + set positioned_block [punk::ansi::ansistrip $positioned_block] + } } else { set positioned_block [textblock::join -- $img_indent $b] } @@ -663,7 +789,7 @@ namespace eval ::punk::pdf { #(common to encounter line data where only some text elements are within the block we are currently at - e.g tables) #todo - review treatment of #assume if it just extends out of the blocks x range - it's part of a larger block (?) review - xrangefudge? - set has_element_in_range 0 + set has_element_in_range 0 foreach xxt $ydata { lassign $xxt dx0 dx1 set xrangefudge 0 ;#shouldn't be needed? @@ -682,7 +808,7 @@ namespace eval ::punk::pdf { lassign $debug_pageblock dp db if {$pageindex in $debug_pageblock_pages && $tbnum in $debug_pageblock_blocks} { - set debuginfo "b:$tbnum yinfo: $yinfo ydata: $ydata" + set debuginfo "b:$tbnum yinfo: $yinfo ydata: [punk::ansi::ansistring VIEW -lf 1 $ydata]" if {$debug_highlight ne ""} { set debuginfo "[punk::ansi::a {*}$debug_highlight]$debuginfo\x1b\[m" } @@ -717,23 +843,44 @@ namespace eval ::punk::pdf { set existing_y0s [lsearch -all -inline -index 0 -subindices $vfudged_lines *] set existing_y1s [lsearch -all -inline -index 1 -subindices $vfudged_lines *] set vfudged_line_added 0 + #set vfidx -1 + #foreach ey0 $existing_y0s { + # incr vfidx + # set vdiff [expr {abs($line_y0 - $ey0)}] + # #x units per char is normally narrower than height + # set linefudge 1.5 ;#choose linefudge less than assumed x_per_line of 2*base_x_per_c - or small font lines will not be separated #REVIEW + # if {$vdiff < ($linefudge * $base_x_per_c)} { + # #normalize to existing value + # #line_y1 unnormalised?? + # #if y1 vdiff is about the same - normalise it too + # set ey1 [lindex $existing_y1s $vfidx] + # set vdiff2 [expr {abs($line_y1 -$ey1)}] + # if {abs($vdiff2 - $vdiff) <= 8} { + # set line_y1 $ey1 + # } + # set vfudged_line_added 1 + # lappend vfudged_lines [list $ey0 $line_y1 [lindex $fl 2]] + # break + # } + #} + + #when mixing fonts in same line - aligning the bases is more common. set vfidx -1 - foreach ey0 $existing_y0s { + foreach ey1 $existing_y1s { incr vfidx - set vdiff [expr {abs($line_y0 - $ey0)}] + set vdiff [expr {abs($line_y1 - $ey1)}] #x units per char is normally narrower than height set linefudge 1.5 ;#choose linefudge less than assumed x_per_line of 2*base_x_per_c - or small font lines will not be separated #REVIEW if {$vdiff < ($linefudge * $base_x_per_c)} { #normalize to existing value - #line_y1 unnormalised?? - #if y1 vdiff is about the same - normalise it too - set ey1 [lindex $existing_y1s $vfidx] - set vdiff2 [expr {abs($line_y1 -$ey1)}] + #if y0 vdiff is about the same - normalise it too + set ey0 [lindex $existing_y0s $vfidx] + set vdiff2 [expr {abs($line_y0 -$ey0)}] if {abs($vdiff2 - $vdiff) <= 8} { - set line_y1 $ey1 + set line_y0 $ey0 } set vfudged_line_added 1 - lappend vfudged_lines [list $ey0 $line_y1 [lindex $fl 2]] + lappend vfudged_lines [list $line_y0 $ey1 [lindex $fl 2]] break } } @@ -774,7 +921,7 @@ namespace eval ::punk::pdf { incr block_linenum #puts "fline: $fl" lassign $fl fy0 fy1 spans_by_x0 ;#don't pull out spans_by_x0_adjusted - it's adjusted within this loop by referring to its index in working_lines - + set line_chunks [list] #calculating x units per char for each string is only really possible if we know the exact font and size set xidx -1 ;#index of spans in line, which are sorted by x0 (lhs) @@ -783,6 +930,8 @@ namespace eval ::punk::pdf { incr xidx lassign $xxt tx0 tx1 text + #puts -nonewline $testchan $text + #any span, including first may have rhs alignments with spans from previous lines in the same block #look at original rhs alignment of spans in previous lines that match this one's x1 @@ -850,6 +999,7 @@ namespace eval ::punk::pdf { #1st shot - use rhs of element immediately to the left (either existing TESTBLOCK or earliear spans on same line) if {$xidx == 0} { + set prevspan_x0 "" set prevspan_x1 "" set prevspan_x1_adjusted "" if {$TESTBLOCK ne ""} { @@ -888,6 +1038,7 @@ namespace eval ::punk::pdf { } else { #set prevspan_x1_adjusted [lindex $spans_by_x0_adjusted $xidx-1 1] + set prevspan_x0 [lindex $spans_by_x0 $xidx-1 0] set prevspan_x1_adjusted [lindex $working_lines $block_linenum 3 $xidx-1 1] set prevspan_x1 [lindex $spans_by_x0 $xidx-1 1] #set effective_prev_x1 [expr {max($prevspan_x1,$prevspan_x1_adjusted)}] @@ -983,7 +1134,7 @@ namespace eval ::punk::pdf { if {$pageindex in $debug_pageblock_pages && $tbnum in $debug_pageblock_blocks} { set debuginfo "span x0 xx1 text: [lindex $xxt 0] [lindex $xxt 1] [punk::ansi::a normal]'[lindex $xxt 2]'\x1b\[m" if {$debug_highlight ne ""} { - set debuginfo "[punk::ansi::a underline {*}$debug_highlight]$debuginfo\x1b\[m" + set debuginfo "[punk::ansi::a overline {*}$debug_highlight]$debuginfo\x1b\[m" } puts $errchan $debuginfo if {$prevspan_x1 ne $prevspan_x1_adjusted} { @@ -992,9 +1143,9 @@ namespace eval ::punk::pdf { } else { set C "" ; set RST "" } - set debuginfo "prevspan_x1:$prevspan_x1 prevspan_x1_adjusted: $C$prevspan_x1_adjusted$RST effective_prev_x1: $effective_prev_x1" + set debuginfo "prev x0 x1 : $prevspan_x0 $prevspan_x1 prevspan_x1_adjusted: $C$prevspan_x1_adjusted$RST effective_prev_x1: $effective_prev_x1" if {$debug_highlight ne ""} { - set debuginfo "[punk::ansi::a overline {*}$debug_highlight]$debuginfo\x1b\[m" + set debuginfo "[punk::ansi::ansiwrap underline {*}$debug_highlight $debuginfo]" } puts $errchan $debuginfo } @@ -1014,7 +1165,9 @@ namespace eval ::punk::pdf { #set buffer [string repeat " " $start_char_pos] #set x_gap_chars 0 } else { - set x_gap [expr {$tx0 - $prevspan_x1}] ;#should be positive (but could it be negative, ie overlapped?) + set x_gap [expr {$tx0 - $prevspan_x1}] ;#usually positive but can easily be negative. e.g the word TAX in some fonts can have the left of the A slightly under the top bar of the T + #if the font is large - this negative offset could be more than an entire width of our output character size. + #we really need to know the source font to do this even close to properly - REVIEW set x_gap_chars [x_to_c $x_gap] ;#used to expand buffer below during collision detection. #set buffer [string repeat "^" $x_gap_chars] } @@ -1022,14 +1175,19 @@ namespace eval ::punk::pdf { #if {$xidx > 0} { # set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1] + $x_gap_chars}]] #} - - set text_points [c_to_x [string length $text]] - set tend [expr {$tx0 + $text_points}] - set remspace [expr {$tx1 - $tend}] - #set tailbuffer [string repeat "-" [x_to_c $remspace]] - set tailbuffer "" - - if {$xidx > 0 && $effective_prev_x1 ne "" && $tx0 <= $effective_prev_x1} { + if {$xidx > 0 && $x_gap >= -3 && $x_gap < 0} { + #treat small negative gap (overlap) in source (prev_x1 > tx0) (typically for character overhang/underhang) as adjacency + dict set page_seen_chunks $text 1 + set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] + lappend line_chunks "$buffer$text" + } elseif {$xidx > 0 && $x_gap >= 0 && $x_gap < ($base_x_per_c * 0.75)} { + #treat no, or small positive gap in source (prev_x1 < tx0) as intended adjacency + #.75 is yet another fudge for lack of font info - todo - rework to another engine? + #this branch could be merged with above, but kept separate for clarity and for specific comments. + dict set page_seen_chunks $text 1 + set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] + lappend line_chunks "$buffer$text" + } elseif {$xidx > 0 && $effective_prev_x1 ne "" && $tx0 <= $effective_prev_x1} { #possible overlap/overlay - left x of this data is before right x of previous #first use output font's x_per_c and see if only overlaying whitespace @@ -1075,6 +1233,8 @@ namespace eval ::punk::pdf { #set xcursor_prev [string length [punk::ansi::ansistrip [join [lrange $line_chunks 0 end-1] ""]]] #set buffer_prev [string repeat " " [expr {$startpos - $xcursor_prev}]] #lset line_chunks end "$buffer_prev[punk::ansi::a {*}$highlight_overwrites]$text[punk::ansi::a]" + + lset line_chunks end "$buffer[punk::ansi::a {*}$highlight_overwrites]$text[punk::ansi::a]" } #else: no need to overwrite same text if no ansi @@ -1110,7 +1270,7 @@ namespace eval ::punk::pdf { set expand [string repeat " " $expandchars] append buffer $expand # REVIEW - seems to work reasonably (e.g in FTP-1.1.1.pdf mentioned) - but we'll still output a warning - lappend line_chunks $buffer$text$tailbuffer + lappend line_chunks $buffer$text if {$char_overlap > 0 && $warnings_textblock >= 4} { puts $errchan "Warning pageindex [format %4s $pageindex] blockindex [format %4s $tbnum] blockwarning [format %4s $block_warning_count] type {text overlay}" @@ -1137,7 +1297,7 @@ namespace eval ::punk::pdf { if {$xidx > 0 && $x_gap_chars == 0} { set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] } - lappend line_chunks "$buffer$text$tailbuffer" + lappend line_chunks "$buffer$text" } } else { dict set page_seen_chunks $text 1 @@ -1147,7 +1307,7 @@ namespace eval ::punk::pdf { set buffer [string repeat " " [expr {[x_to_c $effective_prev_x1]}]] } } - lappend line_chunks "$buffer$text$tailbuffer" + lappend line_chunks "$buffer$text" } #position 3 is spans_by_x0_adjusted set chunk [lindex $line_chunks end] @@ -1208,10 +1368,10 @@ namespace eval ::punk::pdf { puts $outchan "____________________________$bx1,$by1" } } - _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $warnings_textblock $outchan $errchan if {[llength $MERGE_BUFFER]} { - puts $errchan "WARNING: MERGE_BUFFER not empty, but should be empty at end of page" - puts $outchan [punk::lib::showdict -roottype list $MERGE_BUFFER] + _process_merge_buffer MERGE_BUFFER $pageindex $cfg_blocksep_default $opt_blocksep $opt_merge_yblocks $opt_shrink_textfree_blocks $opt_postcompact $warnings_textblock $outchan $errchan + #puts $errchan "WARNING: MERGE_BUFFER not empty, but should be empty at end of page" + #puts $outchan [punk::lib::showdict -roottype list $MERGE_BUFFER] } } @@ -1219,12 +1379,15 @@ namespace eval ::punk::pdf { puts $errchan "MuPDF wasrepaired: [$d wasrepaired]" puts $errchan "MuPDF warnings : [$d warnings]" } + flush $outchan + flush $errchan if {$opt_outchan eq "return" || $opt_errchan eq "return"} { #if both are 'return' at the same time - the same channel is used for both - flush $outchan - flush $errchan - set result [set results($callid)] + set result [encoding convertfrom utf-8 [set results($callid)]] + #set result [set results($callid)] + return $result } + return } } @@ -1264,6 +1427,43 @@ tcl::namespace::eval punk::pdf::lib { return $result } + #subset of possible boms - just for experimenting + proc bom_ident {binstr} { + if {[string match \uFFEF* $binstr]} { + return bom + } + set first32 [string range $binstr 0 3] + set bytes [binary scan $binstr H2H2H2H2 a b c d] + switch -- $a { + fe { + if {"$b" eq "ff"} { + return utf-16be + } + } + ef { + if {"$b$c" eq "bbbf"} { + return utf-8 + } + } + ff { + if {$b eq "fe"} { + if {"$c$d" eq "0000"} { + return utf-32le + } else { + return utf-16le + } + } + } + default { + if {"$a$b$c$d" eq "0000feff"} { + return utf-32be + } + } + } + return unknown + } + + #points to number of chars - in terms of fixed-width output font proc x_to_c {x} { upvar ::punk::pdf::base_x_per_c base_x_per_c @@ -1277,20 +1477,50 @@ tcl::namespace::eval punk::pdf::lib { return [expr {$charcount * $base_x_per_c}] } + proc merge_buffer_dimensions {bufname} { + upvar $bufname MB + if {![llength $MB]} { + return [list 0 0 0 0] ;#x0 y0 x1 y1 + } + set firstheader [lindex $MB 0 0] + set flds [dict get [lindex $firstheader 1] fields] + set dimensions [list [dict get $flds x0] [dict get $flds y0] [dict get $flds x1] [dict get $flds y1]] + #we can get negative values e.g image offset to left or top + foreach sametopblocks $MB { + foreach B $sametopblocks { + if {[lindex $B 0] eq "header"} { + set flds [dict get [lindex $B 1] fields] + set this_x0 [dict get $flds x0] + set this_y0 [dict get $flds y0] + set this_x1 [dict get $flds x1] + set this_y1 [dict get $flds y1] + lset dimensions 0 [expr {min([lindex $dimensions 0],$this_x0)}] + lset dimensions 1 [expr {min([lindex $dimensions 1],$this_y0)}] + lset dimensions 2 [expr {max([lindex $dimensions 2],$this_x1)}] + lset dimensions 3 [expr {max([lindex $dimensions 3],$this_y1)}] + } + } + } + return $dimensions + } #whether merging on or off - we keep a MERGE_BUFFER #At every y-index change, and at end of page, process last entry(s) - proc _process_merge_buffer {bufname pageindex blocksep_default opt_blocksep domerge opt_warnings_textblock outc errc} { + proc _process_merge_buffer {bufname pageindex blocksep_default opt_blocksep domerge opt_shrink_textfree_blocks opt_postcompact opt_warnings_textblock outc errc} { upvar ::punk::pdf::base_x_per_c base_x_per_c + upvar ::punk::pdf::testchan testchan set x_per_line [expr {$base_x_per_c * 2}] ;#hack for now upvar $bufname MB if {!$domerge} { set block_warnings 0 + #puts "MB:'$MB'" set warningrecords [lsearch -all -inline -index 0 $MB warnings] foreach wr $warningrecords { incr block_warnings [lindex $wr 1] } #just emit the header & line blocks we have in the merge buffer one after the other + set blockresult "" ;#needed for postcompact + set emissions [list] ;#need to enable blank sep_block even when -postcompact is true foreach yset $MB { foreach B $yset { switch -- [lindex $B 0] { @@ -1298,22 +1528,72 @@ tcl::namespace::eval punk::pdf::lib { if {$opt_blocksep ne ""} { set sep [dict get [lindex $B 1] sep] set sep [string map [list $block_warnings] $sep] - puts $outc $sep + #puts $outc $sep + lappend emissions [list data $blockresult] + lappend emissions [list sep $sep] + set blockresult "" + #append blockresult $sep \n } } lines { set lines [lindex $B 1] + #puts $errc "--->[punk::ansi::ansistring VIEW $lines]" + set idx -1 foreach l $lines { - puts $outc $l + incr idx + if {[catch { + #puts $outc $l + append blockresult $l \n + } errMsg]} { + set prevline [lindex $lines $idx-1] + set nextline [lindex $lines $idx+1] + puts stderr "error writing output channel $outc\nerrMsg:$errMsg\n" + puts stderr " prevline:'[punk::ansi::ansistring VIEW -lf 1 $prevline]'" + puts stderr " lineview '[punk::ansi::ansistring VIEW -lf 1 $l]'" + puts stderr " nextline:'[punk::ansi::ansistring VIEW -lf 1 $nextline]'" + #puts $testchan "=----------------------------------------=" + #puts $testchan $l + #puts $testchan "=----------------------------------------=" + } } } block { - puts $outc [string map [list \uFFFD " "] [lindex $B 1]] + if {!$opt_shrink_textfree_blocks} { + #puts $outc [string map [list \uFFFD " "] [lindex $B 1]] + append blockresult [string map [list \uFFFD " "] [lindex $B 1]] \n + #puts $errc "--->[punk::ansi::ansistring VIEW [lindex $B 1]]" + } } warnings {} } } } + lappend emissions [list data $blockresult] + + set output "" + if {$opt_postcompact} { + foreach e $emissions { + lassign $e etype data_or_header + switch -- $etype { + sep { + append output $data_or_header \n + } + data { + foreach ln [split $data_or_header \n] { + if {[string trim $ln] ne ""} { + append output $ln \n + } + } + } + } + } + puts -nonewline $outc $output + } else { + foreach e $emissions { + lassign $e _ data_or_header + puts -nonewline $outc $data_or_header\n + } + } set MB [list] return } @@ -1322,38 +1602,71 @@ tcl::namespace::eval punk::pdf::lib { set headers_structure [list] set mergedblock "" set warnings [list] - #puts "MB: $MB" if {[llength $MB]} { - set rootheader [lindex $MB 0 0 1] - #puts $errc "rootheader: $rootheader" - set base_y [dict get $rootheader fields y0] + #rootheader doesn't 'contain' subsequent blocks in x or y direction - it's just the y0 that triggers the new merge because it's below previous blocks + #set rootheader [lindex $MB 0 0 1] + ##puts $errc "rootheader: $rootheader" + #set base_y [dict get $rootheader fields y0] + set mbdimensions [merge_buffer_dimensions MB] ;# x0 y0 x1 y1 + set base_y [lindex $mbdimensions 1] + } + #loop through and determine which blocks of those to be merged are RHS e.g largest x1 + set max_x1 0 + #todo - further refine to find blocks in the mergeset with no textblocks to right? + set rhs_block_indexes [list] + set textblock_x0_list [list] + foreach sametopblocks $MB { + foreach B $sametopblocks { + if {[lindex $B 0] eq "header"} { + set flds [dict get [lindex $B 1] fields] + #puts "flds: $flds" + if {[dict get $flds type] eq "text"} { + set this_x1 [dict get $flds x1] + lappend textblock_x0_list [dict get $flds x0] + if {abs($this_x1 - $max_x1) < 1} { + lappend rhs_block_indexes [dict get $flds blockindex] + set max_x1 [expr {max($this_x1,$max_x1)}] + } elseif {$this_x1 > $max_x1} { + set max_x1 $this_x1 + set rhs_block_indexes [list [dict get $flds blockindex]] + } + } + } + } } - foreach ybuf $MB { + foreach sametop $MB { set yheaders [list] set y_index "" - if {[llength $ybuf]} { - set firstheader [lindex $ybuf 0 1] - set blockrow_y [dict get $firstheader fields y0] - set blockrow_offset [expr {$blockrow_y - $base_y}] - set blockrow_x0 [dict get $firstheader fields x0] - set blockrow_x1 [dict get $firstheader fields x1] + if {[llength $sametop]} { + set firstheader [lindex $sametop 0 1] + set blockrow_y0 [dict get $firstheader fields y0] + set blockrow_y1 [dict get $firstheader fields y1] + set blockrow_top_offset [expr {$blockrow_y0 - $base_y}] + #set blockrow_x0 [dict get $firstheader fields x0] + #set blockrow_x1 [dict get $firstheader fields x1] if {$opt_warnings_textblock >= 9} { puts $errc "Warning type debug operation {merge blockrow} firstheader $firstheader" } } - set leftright_idx -1 - foreach B $ybuf { - incr leftright_idx - #if {$leftright_idx == 0} { - # set firstheader [lindex $B 1] - #} - #while {[llength $ybuf]} {} - #set B [lpop ybuf 0] + set header_idx -1 + #set leftright_max [expr {[llength $sametop] -1}] + set hdrs [lsearch -all -inline -index 0 $sametop header] + #puts "--- > $hdrs" + set header_max [expr {[llength $hdrs] -1}] + foreach B $sametop { #puts "====>$B" switch -- [lindex $B 0] { header { - set this_y_index [dict get [lindex $B 1] fields y-index] + incr header_idx ;#not necessarily in left-right order! + set this_y_index [dict get [lindex $B 1] fields y-index] + set this_x0 [dict get [lindex $B 1] fields x0] + set this_x1 [dict get [lindex $B 1] fields x1] + set this_y0 [dict get [lindex $B 1] fields y0] + set this_y1 [dict get [lindex $B 1] fields y1] + set this_top_offset [expr {$this_y0 - $base_y}] + set this_bottom_offset [expr {$this_y1 - $base_y}] + set this_block_index [dict get [lindex $B 1] fields blockindex] if {$y_index eq ""} { set y_index $this_y_index } else { @@ -1372,10 +1685,11 @@ tcl::namespace::eval punk::pdf::lib { set lines [lindex $B 1] #review - vertical oriented text at left can stomp on data during merge #we are only given the bbox info by tclMuPDF - if {abs($blockrow_x1 - $blockrow_x0) < 1} { + if {abs($this_x1 - $this_x0) < 1} { #non displayable #todo - emit at document tail as a non-layed out block? if {$opt_warnings_textblock > 0} { + #review - firstheader vs this_block_index? #puts $errc "Warning - no x space. Undisplayed text on page:[dict get $firstheader fields pageindex] block:[dict get $firstheader fields blockindex] text:[join $lines \n]" puts $errc "Warning pageindex [dict get $firstheader fields pageindex] blockindex [dict get $firstheader fields blockindex] type {no x space} msg {Undisplayed text} text \"[join $lines \n]\"" } @@ -1386,38 +1700,95 @@ tcl::namespace::eval punk::pdf::lib { #e.g overtype::textblock::ellipsis $space_available $block set block [join $lines \n] - set bwidth [textblock::widthtopline $block] ;#assume non-ragged block + set numlines [llength $lines] + set bwidth [textblock::width $block] ;#assume ragged block (differing line lengths) set margintop "" - if {$blockrow_offset > 0} { - set offsetlines [expr {int(ceil($blockrow_offset / $x_per_line))}] - set margintop [textblock::block $bwidth $offsetlines " "] + set bottom_offsetlines [expr {int(round($this_bottom_offset / $x_per_line))}] + if {[llength $MB] > 1 && $bottom_offsetlines >= $numlines} { + #can use bottom of textblock dist from top (better line alignment for scenario: "smallprint fieldname:" "bigprint value" aligned at base ???) + set topmarginlines [expr {$bottom_offsetlines - $numlines}] + #puts "====> offset from bottom bottom_offsetlines:$bottom_offsetlines topmarginlines:$topmarginlines this_y1:$this_y1" + if {$topmarginlines > 0} { + set margintop [textblock::block $bwidth $topmarginlines " "] + } + } else { + if {$this_top_offset > 0} { + #puts "====> offset from top" + #set offsetlines [expr {int(ceil($blockrow_top_offset / $x_per_line))}] + set offsetlines [expr {int(round($blockrow_top_offset / $x_per_line))}] ;# + #set offsetlines [expr {int(floor($this_top_offset / $x_per_line))}] ;# + if {$offsetlines > 0} { + set margintop [textblock::block $bwidth $offsetlines " "] + } + } + } + if {$this_block_index ni $rhs_block_indexes} { + #truncate overwidth blocks except for rhs + #overwidth if can collide (or merge) with x0 of any other textblocks in the merge set + #only interested in blocks where x0 >= this x1 + set next_x0_list [list] + foreach x0 $textblock_x0_list { + if {$x0 >= ($this_x1 - $base_x_per_c)} { + lappend next_x0_list $x0 + } + } + set next_x0_list [lsort -real $next_x0_list] + set closest_x0 [lindex $next_x0_list 0] + if {$closest_x0 ne ""} { + #set maxwidthchars [x_to_c $this_x1] + set maxwidthchars [x_to_c $closest_x0] + incr maxwidthchars -1 ;#vertically merging text blocks results in unintended text adjacencies - REVIEW + if {$maxwidthchars < $bwidth} { + #warn of rhs truncation + #text will have 'ellipsis' + #review - source text could already have ellipsis - this makes detecting truncation harder + #for now we will produce a warning if ellipsis is in the result and is on the right hand side + #todo - use a function other than textblock::frame which can report truncation + set block [textblock::frame -type {} -boxlimits {} -width [expr {$maxwidthchars +2}] $block] ;# +2 for vertical frame borders - which are removed but needed for block -width + set is_truncated 0 + foreach ln [split [punk::ansi::ansistrip $block] \n] { + if {[regexp {[\u2026]$} $ln]} { + set is_truncated 1 + break + } + } + if {$is_truncated} { + if {$opt_warnings_textblock > 0} { + puts $errc "Warning pageindex [dict get $firstheader fields pageindex] blockindex [dict get $firstheader fields blockindex] type {truncation} msg {merged textblock truncated on rhs} text \"[join $lines \n]\"" + } + lappend warnings 1 + } + } + } + } + #apply margintop after possible rhs truncation above (no need for ellipsis above content) + if {$margintop ne ""} { + set movedblock $margintop\n$block + } else { + set movedblock $block } + if {$mergedblock eq ""} { #could be image - as we are using overtype - should result in blank area #todo - consider option to set bg colour, then create a block of spaces of the right size - but don't add any chars other than ANSI codes which can be stripped easily. - if {$margintop ne ""} { - set mergedblock $margintop\n$block - } else { - set mergedblock $block - } + set mergedblock $movedblock } else { - if {$margintop ne ""} { - set movedblock $margintop\n$block - } else { - set movedblock $block - } set mergedblock [overtype::block -transparent 1 -overflow 1 $mergedblock $movedblock] } + } } block { set block [lindex $B 1] set bwidth [textblock::widthtopline $block] ;#assume non-ragged block set margintop "" - if {$blockrow_offset > 0} { - set offsetlines [expr {int(ceil($blockrow_offset / $x_per_line))}] - set margintop [textblock::block $bwidth $offsetlines "\uFFFD"] - #puts "=====> offset:$blockrow_offset" + if {$this_top_offset > 0} { + #set offsetlines [expr {int(ceil($this_top_offset / $x_per_line))}] + set offsetlines [expr {int(round($this_top_offset / $x_per_line))}] + if {$offsetlines > 0} { + set margintop [textblock::block $bwidth $offsetlines "\uFFFD"] + } + #puts "=====> offset:$this_top_offset" } if {$mergedblock eq ""} { if {$margintop ne ""} { @@ -1431,6 +1802,7 @@ tcl::namespace::eval punk::pdf::lib { } else { set movedblock $block } + #review - img placeholder can obscure text if above it - we don't have transparency info - so probably not what's wanted set mergedblock [overtype::block -transparent \uFFFD -overflow 1 $mergedblock $movedblock] } } @@ -1443,6 +1815,7 @@ tcl::namespace::eval punk::pdf::lib { lappend headers_structure $yheaders } if {[llength $all_headers] == 1} { + #puts stdout "---> [lindex $all_headers 0 1]" #only one block was in the MERGE_BUFFER set stored_filled_sep [dict get [lindex $all_headers 0] sep] if {$opt_blocksep ne ""} { @@ -1453,9 +1826,41 @@ tcl::namespace::eval punk::pdf::lib { } #all % placeholders filled, but isn't set sep [string map [list $block_warnings] $stored_filled_sep] + #still want to allow emitting empty line when opt_postcompact is true, if opt_blocksep is set to space puts $outc $sep } - puts $outc $mergedblock + #set is_text_free 1 ;#default to disprove + #set headerinfo [lindex $all_headers 0 1] + #set list_merges [dict get $headerinfo type] + #foreach blockrow_merge $list_merges { + # foreach tp $blockrow_merge { + # if {$tp eq "text"} { + # #todo - look for whitespace only and treat as text free + # set is_text_free 0 + # } + # } + #} + set blockresult "" + if {$opt_shrink_textfree_blocks} { + set teststripped [punk::ansi::ansistrip $mergedblock] + if {[string trim $teststripped] ne ""} { + set blockresult $mergedblock + } + } else { + set blockresult $mergedblock + } + set output "" + if {$opt_postcompact} { + foreach ln [split $blockresult \n] { + if {[string trim $ln] ne ""} { + append output $ln \n + } + } + puts -nonewline $outc $output + } else { + puts -nonewline $outc $blockresult\n + } + } elseif {[llength $all_headers] > 1} { #more than one block was in the MERGE_BUFFER set merged_minx0 Inf @@ -1489,7 +1894,27 @@ tcl::namespace::eval punk::pdf::lib { dict set map %marker% "[punk::ansi::a bold cyan]MERGEDBLOCK[punk::ansi::a]" puts $outc [string map $map $opt_blocksep] } - puts $outc $mergedblock + if {$opt_shrink_textfree_blocks} { + set teststripped [punk::ansi::ansistrip $mergedblock] + if {[string trim $teststripped] ne ""} { + #puts $outc $mergedblock + set blockresult $mergedblock + } + } else { + #puts $outc $mergedblock + set blockresult $mergedblock + } + set output "" + if {$opt_postcompact} { + foreach ln [split $blockresult \n] { + if {[string trim $ln] ne ""} { + append output $ln \n + } + } + puts -nonewline $outc $output + } else { + puts -nonewline $outc $blockresult\n + } } set MB [list] }