# ACTIVESTATE TEAPOT-PKG BEGIN TM -*- tcl -*- # -- Tcl Module # @@ Meta Begin # Package csv 0.7.1 # Meta as::build::date 2010-01-16 # Meta as::origin http://sourceforge.net/projects/tcllib # Meta category CSV processing # Meta description Procedures to handle CSV data. # Meta license BSD # Meta platform tcl # Meta require {Tcl 8.3} # Meta subject package csv queue matrix tcllib # Meta summary csv # @@ Meta End # ACTIVESTATE TEAPOT-PKG BEGIN REQUIREMENTS package require Tcl 8.3 # ACTIVESTATE TEAPOT-PKG END REQUIREMENTS # ACTIVESTATE TEAPOT-PKG BEGIN DECLARE package provide csv 0.7.1 # ACTIVESTATE TEAPOT-PKG END DECLARE # ACTIVESTATE TEAPOT-PKG END TM # csv.tcl -- # # Tcl implementations of CSV reader and writer # # Copyright (c) 2001 by Jeffrey Hobbs # Copyright (c) 2001-2008 by Andreas Kupries # # See the file "license.terms" for information on usage and redistribution # of this file, and for a DISCLAIMER OF ALL WARRANTIES. # # RCS: @(#) $Id: csv.tcl,v 1.26 2008/10/02 22:26:48 andreas_kupries Exp $ package require Tcl 8.3 package provide csv 0.7.1 namespace eval ::csv { namespace export join joinlist read2matrix read2queue report namespace export split split2matrix split2queue writematrix writequeue } # ::csv::join -- # # Takes a list of values and generates a string in CSV format. # # Arguments: # values A list of the values to join # sepChar The separator character, defaults to comma # # Results: # A string containing the values in CSV format. proc ::csv::join {values {sepChar ,} {delChar \"}} { set out "" set sep {} foreach val $values { if {[string match "*\[${delChar}$sepChar\]*" $val]} { append out $sep${delChar}[string map [list $delChar ${delChar}${delChar}] $val]${delChar} } else { append out $sep${val} } set sep $sepChar } return $out } # ::csv::joinlist -- # # Takes a list of lists of values and generates a string in CSV # format. Each item in the list is made into a single CSV # formatted record in the final string, the records being # separated by newlines. # # Arguments: # values A list of the lists of the values to join # sepChar The separator character, defaults to comma # # Results: # A string containing the values in CSV format, the records # separated by newlines. proc ::csv::joinlist {values {sepChar ,} {delChar \"}} { set out "" foreach record $values { # note that this is ::csv::join append out "[join $record $sepChar $delChar]\n" } return $out } # ::csv::joinmatrix -- # # Takes a matrix object following the API specified for the # struct::matrix package. Each row of the matrix is converted # into a single CSV formatted record in the final string, the # records being separated by newlines. # # Arguments: # matrix Matrix object command. # sepChar The separator character, defaults to comma # # Results: # A string containing the values in CSV format, the records # separated by newlines. proc ::csv::joinmatrix {matrix {sepChar ,} {delChar \"}} { return [joinlist [$matrix get rect 0 0 end end] $sepChar $delChar] } # ::csv::iscomplete -- # # A predicate checking if the argument is a complete csv record. # # Arguments # data The (partial) csv record to check. # # Results: # A boolean flag indicating the completeness of the data. The # result is true if the data is complete. proc ::csv::iscomplete {data} { expr {1 - [regexp -all \" $data] % 2} } # ::csv::read2matrix -- # # A wrapper around "Split2matrix" reading CSV formatted # lines from the specified channel and adding it to the given # matrix. # # Arguments: # m The matrix to add the read data too. # chan The channel to read from. # sepChar The separator character, defaults to comma # expand The expansion mode. The default is none # # Results: # A list of the values in 'line'. proc ::csv::read2matrix {args} { # FR #481023 # See 'split2matrix' for the available expansion modes. # Argument syntax: # #2) chan m #3) chan m sepChar #3) -alternate chan m #4) -alternate chan m sepChar #4) chan m sepChar expand #5) -alternate chan m sepChar expand set alternate 0 set sepChar , set expand none switch -exact -- [llength $args] { 2 { foreach {chan m} $args break } 3 { foreach {a b c} $args break if {[string equal $a "-alternate"]} { set alternate 1 set chan $b set m $c } else { set chan $a set m $b set sepChar $c } } 4 { foreach {a b c d} $args break if {[string equal $a "-alternate"]} { set alternate 1 set chan $b set m $c set sepChar $d } else { set chan $a set m $b set sepChar $c set expand $d } } 5 { foreach {a b c d e} $args break if {![string equal $a "-alternate"]} { return -code error "wrong#args: Should be ?-alternate? chan m ?separator? ?expand?" } set alternate 1 set chan $b set m $c set sepChar $d set expand $e } 0 - 1 - default { return -code error "wrong#args: Should be ?-alternate? chan m ?separator? ?expand?" } } if {[string length $sepChar] < 1} { return -code error "illegal separator character \"$sepChar\", is empty" } elseif {[string length $sepChar] > 1} { return -code error "illegal separator character \"$sepChar\", is a string" } set data "" while {![eof $chan]} { if {[gets $chan line] < 0} {continue} # Why skip empty lines? They may be in data. Except if the # buffer is empty, i.e. we are between records. if {$line == {} && $data == {}} {continue} append data $line if {![iscomplete $data]} { # Odd number of quotes - must have embedded newline append data \n continue } Split2matrix $alternate $m $data $sepChar $expand set data "" } return } # ::csv::read2queue -- # # A wrapper around "::csv::split2queue" reading CSV formatted # lines from the specified channel and adding it to the given # queue. # # Arguments: # q The queue to add the read data too. # chan The channel to read from. # sepChar The separator character, defaults to comma # # Results: # A list of the values in 'line'. proc ::csv::read2queue {args} { # Argument syntax: # #2) chan q #3) chan q sepChar #3) -alternate chan q #4) -alternate chan q sepChar set alternate 0 set sepChar , switch -exact -- [llength $args] { 2 { foreach {chan q} $args break } 3 { foreach {a b c} $args break if {[string equal $a "-alternate"]} { set alternate 1 set chan $b set q $c } else { set chan $a set q $b set sepChar $c } } 4 { foreach {a b c d} $args break if {![string equal $a "-alternate"]} { return -code error "wrong#args: Should be ?-alternate? chan q ?separator?" } set alternate 1 set chan $b set q $c set sepChar $d } 0 - 1 - default { return -code error "wrong#args: Should be ?-alternate? chan q ?separator?" } } if {[string length $sepChar] < 1} { return -code error "illegal separator character \"$sepChar\", is empty" } elseif {[string length $sepChar] > 1} { return -code error "illegal separator character \"$sepChar\", is a string" } set data "" while {![eof $chan]} { if {[gets $chan line] < 0} {continue} # Why skip empty lines? They may be in data. Except if the # buffer is empty, i.e. we are between records. if {$line == {} && $data == {}} {continue} append data $line if {![iscomplete $data]} { # Odd number of quotes - must have embedded newline append data \n continue } $q put [Split $alternate $line $sepChar] set data "" } return } # ::csv::report -- # # A report command which can be used by the matrix methods # "format-via" and "format2chan-via". For the latter this # command delegates the work to "::csv::writematrix". "cmd" is # expected to be either "printmatrix" or # "printmatrix2channel". The channel argument, "chan", has to # be present for the latter and must not be present for the first. # # Arguments: # cmd Either 'printmatrix' or 'printmatrix2channel' # matrix The matrix to format. # args 0 (chan): The channel to write to # # Results: # None for 'printmatrix2channel', else the CSV formatted string. proc ::csv::report {cmd matrix args} { switch -exact -- $cmd { printmatrix { if {[llength $args] > 0} { return -code error "wrong # args:\ ::csv::report printmatrix matrix" } return [joinlist [$matrix get rect 0 0 end end]] } printmatrix2channel { if {[llength $args] != 1} { return -code error "wrong # args:\ ::csv::report printmatrix2channel matrix chan" } writematrix $matrix [lindex $args 0] return "" } default { return -code error "Unknown method $cmd" } } } # ::csv::split -- # # Split a string according to the rules for CSV processing. # This assumes that the string contains a single line of CSVs # # Arguments: # line The string to split # sepChar The separator character, defaults to comma # # Results: # A list of the values in 'line'. proc ::csv::split {args} { # Argument syntax: # # (1) line # (2) line sepChar # (2) -alternate line # (3) -alternate line sepChar # (3) line sepChar delChar # (4) -alternate line sepChar delChar set alternate 0 set sepChar , set delChar \" switch -exact -- [llength $args] { 1 { set line [lindex $args 0] } 2 { foreach {a b} $args break if {[string equal $a "-alternate"]} { set alternate 1 set line $b } else { set line $a set sepChar $b } } 3 { foreach {a b c} $args break if {[string equal $a "-alternate"]} { set alternate 1 set line $b set sepChar $c } else { set line $a set sepChar $b set delChar $c } } 4 { foreach {a b c d} $args break if {![string equal $a "-alternate"]} { return -code error "wrong#args: Should be ?-alternate? line ?separator? ?delimiter?" } set alternate 1 set line $b set sepChar $c set delChar $d } 0 - default { return -code error "wrong#args: Should be ?-alternate? line ?separator? ?delimiter?" } } if {[string length $sepChar] < 1} { return -code error "illegal separator character ${delChar}$sepChar${delChar}, is empty" } elseif {[string length $sepChar] > 1} { return -code error "illegal separator character ${delChar}$sepChar${delChar}, is a string" } if {[string length $delChar] < 1} { return -code error "illegal separator character \"$delChar\", is empty" } elseif {[string length $delChar] > 1} { return -code error "illegal separator character \"$delChar\", is a string" } return [Split $alternate $line $sepChar $delChar] } proc ::csv::Split {alternate line sepChar {delChar \"}} { # Protect the sepchar from special interpretation by # the regex calls below. set sepRE \\$sepChar set delRE \\$delChar if {$alternate} { # The alternate syntax requires a different parser. # A variation of the string map / regsub parser for the # regular syntax was tried but does not handle embedded # doubled " well (testcase csv-91.3 was 'knownBug', sole # one, still a bug). Now we just tokenize the input into # the primary parts (sep char, "'s and the rest) and then # use an explicitly coded state machine (DFA) to parse # and convert token sequences. ## puts 1->>$line<< set line [string map [list \ $sepChar \0$sepChar\0 \ $delChar \0${delChar}\0 \ ] $line] ## puts 2->>$line<< set line [string map [list \0\0 \0] $line] regsub "^\0" $line {} line regsub "\0$" $line {} line ## puts 3->>$line<< set val "" set res "" set state base ## puts 4->>[::split $line \0] foreach token [::split $line \0] { ## puts "\t*= $state\t>>$token<<" switch -exact -- $state { base { if {[string equal $token "${delChar}"]} { set state qvalue continue } if {[string equal $token $sepChar]} { lappend res $val set val "" continue } append val $token } qvalue { if {[string equal $token "${delChar}"]} { # May end value, may be a doubled " set state endordouble continue } append val $token } endordouble { if {[string equal $token "${delChar}"]} { # Doubled ", append to current value append val ${delChar} set state qvalue continue } # Last " was end of quoted value. Close it. # We expect current as $sepChar lappend res $val set val "" set state base if {[string equal $token $sepChar]} {continue} # Undoubled " in middle of text. Just assume that # remainder is another qvalue. set state qvalue } default { return -code error "Internal error, illegal parsing state" } } } ## puts "/= $state\t>>$val<<" lappend res $val ## puts 5->>$res<< return $res } else { regsub -- "$sepRE${delRE}${delRE}$" $line $sepChar\0${delChar}${delChar}\0 line regsub -- "^${delRE}${delRE}$sepRE" $line \0${delChar}${delChar}\0$sepChar line regsub -all -- {(^${delChar}|${delChar}$)} $line \0 line set line [string map [list \ $sepChar${delChar}${delChar}${delChar} $sepChar\0${delChar} \ ${delChar}${delChar}${delChar}$sepChar ${delChar}\0$sepChar \ ${delChar}${delChar} ${delChar} \ ${delChar} \0 \ ] $line] set end 0 while {[regexp -indices -start $end -- {(\0)[^\0]*(\0)} $line \ -> start end]} { set start [lindex $start 0] set end [lindex $end 0] set range [string range $line $start $end] if {[string first $sepChar $range] >= 0} { set line [string replace $line $start $end \ [string map [list $sepChar \1] $range]] } incr end } set line [string map [list $sepChar \0 \1 $sepChar \0 {} ] $line] return [::split $line \0] } } # ::csv::split2matrix -- # # Split a string according to the rules for CSV processing. # This assumes that the string contains a single line of CSVs. # The resulting list of values is appended to the specified # matrix, as a new row. The code assumes that the matrix provides # the same interface as the queue provided by the 'struct' # module of tcllib, "add row" in particular. # # Arguments: # m The matrix to write the resulting list to. # line The string to split # sepChar The separator character, defaults to comma # expand The expansion mode. The default is none # # Results: # A list of the values in 'line', written to 'q'. proc ::csv::split2matrix {args} { # FR #481023 # Argument syntax: # #2) m line #3) m line sepChar #3) -alternate m line #4) -alternate m line sepChar #4) m line sepChar expand #5) -alternate m line sepChar expand set alternate 0 set sepChar , set expand none switch -exact -- [llength $args] { 2 { foreach {m line} $args break } 3 { foreach {a b c} $args break if {[string equal $a "-alternate"]} { set alternate 1 set m $b set line $c } else { set m $a set line $b set sepChar $c } } 4 { foreach {a b c d} $args break if {[string equal $a "-alternate"]} { set alternate 1 set m $b set line $c set sepChar $d } else { set m $a set line $b set sepChar $c set expand $d } } 4 { foreach {a b c d e} $args break if {![string equal $a "-alternate"]} { return -code error "wrong#args: Should be ?-alternate? m line ?separator? ?expand?" } set alternate 1 set m $b set line $c set sepChar $d set expand $e } 0 - 1 - default { return -code error "wrong#args: Should be ?-alternate? m line ?separator? ?expand?" } } if {[string length $sepChar] < 1} { return -code error "illegal separator character \"$sepChar\", is empty" } elseif {[string length $sepChar] > 1} { return -code error "illegal separator character \"$sepChar\", is a string" } Split2matrix $alternate $m $line $sepChar $expand return } proc ::csv::Split2matrix {alternate m line sepChar expand} { set csv [Split $alternate $line $sepChar] # Expansion modes # - none : default, behaviour of original implementation. # no expansion is done, lines are silently truncated # to the number of columns in the matrix. # # - empty : A matrix without columns is expanded to the number # of columns in the first line added to it. All # following lines are handled as if "mode == none" # was set. # # - auto : Full auto-mode. The matrix is expanded as needed to # hold all columns of all lines. switch -exact -- $expand { none {} empty { if {[$m columns] == 0} { $m add columns [llength $csv] } } auto { if {[$m columns] < [llength $csv]} { $m add columns [expr {[llength $csv] - [$m columns]}] } } } $m add row $csv return } # ::csv::split2queue -- # # Split a string according to the rules for CSV processing. # This assumes that the string contains a single line of CSVs. # The resulting list of values is appended to the specified # queue, as a single item. IOW each item in the queue represents # a single CSV record. The code assumes that the queue provides # the same interface as the queue provided by the 'struct' # module of tcllib, "put" in particular. # # Arguments: # q The queue to write the resulting list to. # line The string to split # sepChar The separator character, defaults to comma # # Results: # A list of the values in 'line', written to 'q'. proc ::csv::split2queue {args} { # Argument syntax: # #2) q line #3) q line sepChar #3) -alternate q line #4) -alternate q line sepChar set alternate 0 set sepChar , switch -exact -- [llength $args] { 2 { foreach {q line} $args break } 3 { foreach {a b c} $args break if {[string equal $a "-alternate"]} { set alternate 1 set q $b set line $c } else { set q $a set line $b set sepChar $c } } 4 { foreach {a b c d} $args break if {![string equal $a "-alternate"]} { return -code error "wrong#args: Should be ?-alternate? q line ?separator?" } set alternate 1 set q $b set line $c set sepChar $d } 0 - 1 - default { return -code error "wrong#args: Should be ?-alternate? q line ?separator?" } } if {[string length $sepChar] < 1} { return -code error "illegal separator character \"$sepChar\", is empty" } elseif {[string length $sepChar] > 1} { return -code error "illegal separator character \"$sepChar\", is a string" } $q put [Split $alternate $line $sepChar] return } # ::csv::writematrix -- # # A wrapper around "::csv::join" taking the rows in a matrix and # writing them as CSV formatted lines into the channel. # # Arguments: # m The matrix to take the data to write from. # chan The channel to write into. # sepChar The separator character, defaults to comma # # Results: # None. proc ::csv::writematrix {m chan {sepChar ,} {delChar \"}} { set n [$m rows] for {set r 0} {$r < $n} {incr r} { puts $chan [join [$m get row $r] $sepChar $delChar] } # Memory intensive alternative: # puts $chan [joinlist [m get rect 0 0 end end] $sepChar $delChar] return } # ::csv::writequeue -- # # A wrapper around "::csv::join" taking the rows in a queue and # writing them as CSV formatted lines into the channel. # # Arguments: # q The queue to take the data to write from. # chan The channel to write into. # sepChar The separator character, defaults to comma # # Results: # None. proc ::csv::writequeue {q chan {sepChar ,} {delChar \"}} { while {[$q size] > 0} { puts $chan [join [$q get] $sepChar $delChar] } # Memory intensive alternative: # puts $chan [joinlist [$q get [$q size]] $sepChar $delChar] return }