You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
366 lines
13 KiB
366 lines
13 KiB
# |
|
# Copyright (c) 2015, Ashok P. Nadkarni |
|
# All rights reserved. |
|
# |
|
# See the file license.terms for license |
|
# |
|
|
|
namespace eval tclcsv { |
|
variable script_dir |
|
set script_dir [file dirname [info script]] |
|
} |
|
|
|
# Like _sniff above but only considers delimiters |
|
proc tclcsv::_sniff {chan delimiters} { |
|
set seek_pos [chan tell $chan] |
|
if {$seek_pos == -1} { |
|
error "Channel is not seekable" |
|
} |
|
|
|
# TBD - what if delimiters or quotes is empty? |
|
# TBD - what if no rows ? |
|
|
|
set escapes [list \\ ""] |
|
set quotes [list \" '] |
|
set combinations {} |
|
foreach delimiter $delimiters { |
|
unset -nocomplain width_frequencies |
|
try { |
|
set nrows 0 |
|
while {[gets $chan line] >= 0} { |
|
if {$line eq ""} continue |
|
set row [split $line $delimiter] |
|
set n [llength $row] |
|
incr width_frequencies($n) |
|
incr nrows |
|
if {$nrows > 100} break |
|
} |
|
} finally { |
|
chan seek $chan $seek_pos |
|
} |
|
if {![info exists width_frequencies]} continue |
|
set sorted_frequencies [lsort -stride 2 -decreasing -integer -index 1 [array get width_frequencies]] |
|
set mode [lindex $sorted_frequencies 0] |
|
# Fraction of the occurences where the mode frequency occurs |
|
set mode_frac [expr {[lindex $sorted_frequencies 1] / double($nrows)}] |
|
lappend combinations [list $delimiter $mode $mode_frac] |
|
} |
|
|
|
# Sort the candidates such that |
|
# - those where the mode fraction is higher are preferred under |
|
# the assumption that the more lines that have the same number |
|
# of occurences of that character, the greater the likelihood |
|
# that character is the delimiter |
|
# - those where the mode is higher is preferred under the assumption |
|
# that greater occurences of a character within a line are |
|
# less likely to be by chance |
|
set comparator { |
|
{a b} { |
|
lassign $a adelim amode afrac |
|
lassign $b bdelim bmode bfrac |
|
|
|
#return [expr {(sqrt($amode)*$afrac) > (sqrt($bmode)*$bfrac)}] |
|
set aweight [expr {(sqrt($amode)*$afrac)}] |
|
set bweight [expr {(sqrt($bmode)*$bfrac)}] |
|
if {$aweight > $bweight} { |
|
return 1 |
|
} elseif {$aweight < $bweight} { |
|
return -1 |
|
} else { |
|
return 0 |
|
} |
|
} |
|
} |
|
set winner [lindex [lsort -decreasing -command [list apply $comparator] $combinations] 0] |
|
set delimiter [lindex $winner 0] |
|
set nfields [lindex $winner 1] |
|
|
|
# We have picked a delimiter. Now figure out whether |
|
# quotes are in use. By default " is assumed to be the quote char |
|
# If we find sufficient number of fields beginning with and ending |
|
# with ' then we assume that is the quote character. |
|
# Along the way we also check if |
|
# - initial spaces are to be skipped |
|
# - quotes are doubled |
|
# - an escape character is in use |
|
# - comment character |
|
try { |
|
set nrows 0 |
|
while {[gets $chan line] >= 0} { |
|
if {$line eq ""} continue |
|
set row [split $line $delimiter] |
|
set n [llength $row] |
|
if {$n == $nfields} { |
|
lappend good $row |
|
} elseif {$n < $nfields} { |
|
lappend short $row |
|
} else { |
|
lappend long $row |
|
} |
|
incr nrows |
|
if {$nrows > 100} break |
|
} |
|
set ngood [llength $good] |
|
if {$ngood == 0} { |
|
error "Failed to find lines with expected number of fields" |
|
} |
|
foreach row $good { |
|
set findex 0 |
|
set doublequotecount 0 |
|
set singlequotecount 0 |
|
foreach field $row { |
|
# Check for quotes |
|
if {[regexp {^\s*".*"$} $field]} { |
|
incr doublequotecount |
|
} elseif {[regexp {^\s*'.*'$} $field]} { |
|
incr singlequotecount |
|
} |
|
# Keep track of leading spaces per column |
|
if {[string index $field 0] eq " "} { |
|
incr nspaces($findex) |
|
} |
|
incr findex |
|
} |
|
if {$singlequotecount > $doublequotecount} { |
|
set quotechar ' |
|
} else { |
|
# Note even though double quote is the default do not |
|
# explicitly mark as such unless we have actually seen it |
|
if {$doublequotecount > 0} { |
|
set quotechar \" |
|
} |
|
} |
|
} |
|
|
|
# Check if quotes are doubled |
|
if {[info exists quotechar]} { |
|
set ch $quotechar |
|
} else { |
|
set ch \"; # Default quote char |
|
} |
|
|
|
foreach row $good { |
|
foreach field $row { |
|
# TBD - how to check if quotes are doubled? |
|
if {[regexp "\\s*${ch}\[^${ch}\]*${ch}${ch}" $field]} { |
|
incr doublequote |
|
} elseif {[regexp "\\s*${ch}\[^${ch}\]*(\[^\[:alnum:\]\])${ch}" $field -> esc]} { |
|
incr esc_chars($esc) |
|
} |
|
} |
|
if {(![info exists doublequote]) && [info exists esc_chars]} { |
|
set esc_list [lsort -decreasing -integer -stride 2 -index 1 [array get esc_chars]] |
|
set escape [lindex $esc_list 0] |
|
} |
|
} |
|
|
|
# If every column that had a field beginning with a space also |
|
# had all fields in that column beginning with a space then |
|
# we assume leading spaces are to be skipped. |
|
if {[info exists nspaces]} { |
|
set skipleadingspace 1 |
|
foreach {col space_count} [array get nspaces] { |
|
if {$space_count != $ngood} { |
|
set skipleadingspace 0 |
|
break |
|
} |
|
} |
|
} |
|
# If the bulk of short lines begin with the same character, |
|
# assume it is the comment character |
|
if {[info exists short]} { |
|
foreach row $short { |
|
set line [string trim [join $row $delimiter]] |
|
incr comment([string index $line 0]) |
|
} |
|
set nshort [llength $short] |
|
foreach {ch count} [array get comment] { |
|
if {[expr {double($count)/$nshort}] > 0.8} { |
|
set commentchar $ch |
|
break |
|
} |
|
} |
|
} |
|
|
|
# TBD - perhaps long lines can also be used since long lines |
|
# can result from delimiters embedded within quotes |
|
|
|
} finally { |
|
chan seek $chan $seek_pos |
|
} |
|
|
|
set dialect [list -delimiter $delimiter] |
|
if {[info exists skipleadingspace]} { |
|
lappend dialect -skipleadingspace $skipleadingspace |
|
} |
|
if {[info exists quotechar]} { |
|
lappend dialect -quote $quotechar |
|
} |
|
if {[info exists commentchar]} { |
|
lappend dialect -comment $commentchar |
|
} |
|
if {[info exists doublequote]} { |
|
lappend dialect -doublequote 1 |
|
} else { |
|
if {[info exists escape]} { |
|
lappend dialect -escape $escape |
|
} |
|
} |
|
|
|
|
|
return $dialect |
|
} |
|
|
|
proc tclcsv::sniff_header {args} { |
|
if {[llength $args] == 0} { |
|
error "wrong # args: should be \"sniff_header ?options? channel\"" |
|
} |
|
set chan [lindex $args end] |
|
|
|
set seek_pos [chan tell $chan] |
|
if {$seek_pos == -1} { |
|
error "Channel is not seekable" |
|
} |
|
|
|
try { |
|
set rows [csv_read {*}[lrange $args 0 end-1] -nrows 100 $chan] |
|
if {[llength $rows] < 2} { |
|
error "Insufficient rows in CSV data to sniff headers." |
|
} |
|
set width [llength [lindex $rows 0]] |
|
set types {} |
|
for {set findex 0} {$findex < $width} {incr findex} { |
|
dict set types $findex type unknown |
|
dict set types $findex length [string length [lindex $rows 1 $findex]] |
|
} |
|
foreach row [lrange $rows 1 end] { |
|
if {[llength $row] != $width} continue |
|
for {set findex 0} {$findex < $width} {incr findex} { |
|
set val [lindex $row $findex] |
|
if {[string length $val] != [dict get $types $findex length]} { |
|
dict set types $findex length -1 |
|
} |
|
set field_type [dict get $types $findex type] |
|
if {$field_type eq "string"} continue |
|
# Note values starting with 0 treated as strings (eg. zip codes) |
|
# Exceptions are 0 and 0.something |
|
if {[string index $val 0] eq "0" && |
|
[string length $val] > 1 && |
|
[string index $val 1] ne "."} { |
|
dict set types $findex type string |
|
continue |
|
} |
|
if {$field_type eq "real"} { |
|
if {![string is double -strict $val]} { |
|
dict set types $findex type string |
|
} |
|
continue |
|
} |
|
# field_type is integer or unknown. Our check for |
|
# integer is not [string is wide] because we want to |
|
# treat as decimal numbers and not parse as octals or hex |
|
if {[regexp {^\d+$} $val]} { |
|
dict set types $findex type integer |
|
} elseif {[string is double -strict $val]} { |
|
dict set types $findex type real |
|
} else { |
|
dict set types $findex type string |
|
} |
|
} |
|
} |
|
} finally { |
|
chan seek $chan $seek_pos |
|
} |
|
|
|
# If we could determine that any one column was a non-string type |
|
# (integer or real) but the header field for that column is not |
|
# of that type, we immediately conclude the first row is a header. |
|
# In addition, in the case of columns that are of fixed width, |
|
# we take a vote, where every time the field in first row of the |
|
# fixed width column is of a different width we raise the probability |
|
# of header existence and if of the same width, we lower the probability. |
|
set probably_header 0 |
|
set row [lindex $rows 0] |
|
for {set findex 0} {$findex < $width} {incr findex} { |
|
set field [lindex $row $findex] |
|
set type [dict get $types $findex type] |
|
if {($type eq "integer" && ![string is wide -strict $field]) || |
|
($type eq "real" && ![string is double -strict $field]) |
|
} { |
|
# The type of the first row field is different. Assume header |
|
set probably_header 1 |
|
break |
|
} |
|
set len [dict get $types $findex length] |
|
if {$len >= 0} { |
|
if {$len == [string length $field]} { |
|
incr probably_header -1 |
|
} else { |
|
incr probably_header 1 |
|
} |
|
} |
|
} |
|
|
|
set field_types {} |
|
for {set findex 0} {$findex < $width} {incr findex} { |
|
set type [dict get $types $findex type] |
|
# $type can be unknown if the loop above did a continue right |
|
# at the top for every line |
|
if {$type eq "unknown"} { |
|
set type string |
|
} |
|
lappend field_types $type |
|
} |
|
|
|
if {$probably_header > 0} { |
|
return [list $field_types $row] |
|
} else { |
|
return [list $field_types] |
|
} |
|
} |
|
|
|
proc tclcsv::dialect {dialect {direction read}} { |
|
variable dialects |
|
set dialects [dict create] |
|
dict set dialects excel [list \ |
|
-delimiter , \ |
|
-quote \" \ |
|
-doublequote 1 \ |
|
-skipleadingspace 0] |
|
dict set dialects excel-tab [dict merge [dict get $dialects excel] [list -delimiter \t]] |
|
proc [namespace current]::dialect {dialect {direction read}} { |
|
variable dialects |
|
set opts [dict get $dialects $dialect] |
|
if {$direction eq "write"} { |
|
# Remove options not understood by writes |
|
dict unset opts -skipleadingspace |
|
} |
|
return $opts |
|
} |
|
return [dialect $dialect $direction] |
|
} |
|
|
|
proc tclcsv::sniff {args} { |
|
if {[llength $args] == 0} { |
|
error "wrong # args: should be \"sniff ?options? channel\"" |
|
} |
|
set chan [lindex $args end] |
|
|
|
array set opts [dict merge [dict create \ |
|
-delimiters [list "," ";" ":" "\t"] \ |
|
] \ |
|
[lrange $args 0 end-1]] |
|
|
|
return [_sniff $chan $opts(-delimiters)] |
|
} |
|
|
|
proc tclcsv::dialectpicker args { |
|
variable script_dir |
|
uplevel #0 {package require Tk; package require snit} |
|
uplevel #0 [list source [file join $script_dir widgets.tcl]] |
|
uplevel 1 tclcsv::dialectpicker $args |
|
} |
|
|
|
namespace eval tclcsv { |
|
namespace export csv_read csv_write sniff sniff_header dialect |
|
}
|
|
|