diff options
author | Mike Gerwitz <mike.gerwitz@rtspecialty.com> | 2018-10-03 14:21:35 -0400 |
---|---|---|
committer | Mike Gerwitz <mike.gerwitz@rtspecialty.com> | 2018-10-03 14:21:35 -0400 |
commit | 397710c055d26cc7d6a14b2cc804f427a8cf9c57 (patch) | |
tree | 4df9e36a6f42c0c0ba1da7269fb3100882f7929e | |
parent | d251f7a79ba854a73a15d344bdad1627d21153a4 (diff) | |
download | tame-397710c055d26cc7d6a14b2cc804f427a8cf9c57.tar.gz tame-397710c055d26cc7d6a14b2cc804f427a8cf9c57.tar.bz2 tame-397710c055d26cc7d6a14b2cc804f427a8cf9c57.zip |
csvm: Auto-sort expanded output
This will allow the variable abstractions to fully encapsulate values while
still permitting binary searches on sorted rows.
* csvm-expand: Renamed from `csvm2csv'. Add directive support.
* csvm2csv: New script to perform sorting. Invokes aforementioned.
* test/test-csvm2csv: Update for sorting.
-rwxr-xr-x | build-aux/csvm-expand | 195 | ||||
-rwxr-xr-x | build-aux/csvm2csv | 196 | ||||
-rwxr-xr-x | build-aux/test/test-csvm2csv | 66 |
3 files changed, 324 insertions, 133 deletions
diff --git a/build-aux/csvm-expand b/build-aux/csvm-expand new file mode 100755 index 0000000..a231163 --- /dev/null +++ b/build-aux/csvm-expand @@ -0,0 +1,195 @@ +#!/usr/bin/awk -f +# +# Expands a "magic" CSV file into a normal CSV +# +# Copyright (C) 2016, 2018 R-T Specialty, LLC. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# "Magic" CSVs simply exist to make life easier: they permit comments, blank +# lines, variables, sub-delimiter expansion, and any number of ranges per line. +# Ranges will be expanded in every combination, making rate tables highly +# maintainable. +# +# Variables are also supported when defined using :var=val. Variables may +# expand into ranges, 'cause they're awesome. Multiple variables may be +# delimited by semi-colons, as may multiple values. +# +# For example: +# :foo=1--3 +# $foo;7;9--10:$foo, 5--10,1/1/2017 +# +# Would generate: +# 1, 5, 1483246800 +# 1, 6, 1483246800 +# ... +# 5, 10, 1483246800 +# 2, 5, 1483246800 +# ... +# 9, 5, 14832468005 +# ... +# 1, 5, 1483246800 +# 1, 6, 1483246800 +# ... +## + + +# Expand variable with its value, if any +function expand_vars( s, value ) +{ + # attempt to parse variable (may expand into a range) + if ( match( s, /^\$([a-zA-Z_-]+)$/, m ) ) + { + value = vars[ m[1] ]; + + if ( value == "" ) + { + print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr" + exit 1 + } + + return value + } + + return s +} + + +# Expand line +function parseline( i, m, j, me, orig ) +{ + if ( i > NF ) + { + print + return + } + + orig = $i + + # expand variables before any processing so that expansions + # can include any type of formatting + $i = expand_vars( $i ) + + if ( match( $i, /^([0-9]+\/){2}[0-9]+$/, m ) ) + { + cmd = "date --date=" $i " +%s" + cmd |& getline $i + close(cmd) + } + + # check first for delimiters + if ( match( $i, /^([^;]+);(.*)$/, m ) ) + { + # give it a shot with the first value + $i = m[1] + parseline( i ) + + # strip off the first value and process with following value(s) + $i = m[2] + parseline( i ) + + # we've delegated; we're done + $i = orig + return + } + + # parse range + if ( match( $i, /^([^-]+)--([^-]+)$/, m ) ) + { + j = expand_vars( m[1] ) + me = expand_vars( m[2] ) + + if ( !match( j, /^[0-9]+$/ ) || !match( me, /^[0-9]+$/ ) ) + { + print "error: invalid range: `" $i "'" > "/dev/stderr" + exit 1 + } + + do + { + $i = j + parseline( i + 1 ) + } while ( j++ < me ) + } + else + { + parseline( i + 1 ); + } + + # restore to original value + $i = orig +} + + +BEGIN { + # we're parsing CSVs + FS = " *, *" + OFS = "," + + has_directives = 0 + directives = "!(NODIRECTIVES)" +} + + +# skip all lines that begin with `#', which denotes a comment, or are empty +/^#|^$/ { next; } + +# directives are echoed back and are intended for processing by +# the parent csvm2csv script +/^!/ && output_started { + print "error: directive must appear before header: `" $0 "'" > "/dev/stderr" + exit 1 +} +/^!/ && has_directives { + print "error: all directives must be on one line: `" $0 "'" > "/dev/stderr" + exit 1 +} +/^!/ { + has_directives = 1 + directives = $0 + + next +} + +# lines that begin with a colon are variable definitions +/^:/ { + if ( !match( $0, /^:([a-zA-Z_-]+)=(.*?)$/, m ) ) + { + print "error: invalid variable definition: `" $0 "'" > "/dev/stderr" + exit 1 + } + + vars[ m[1] ] = m[2] + next +} + +# Always begin output with a line for directives, even if there are +# none. This makes subsequent processing much easier, since we won't have +# to conditionally ignore the top line. +!output_started { + print directives + + output_started = 1 +} + +# lines that need any sort of processing (ranges, dates, etc) +/--|;|\$[a-zA-Z_-]|\// { parseline( 1 ); next; } + +# all other lines are normal; simply output them verbatim +{ + # this assignment will ensure that awk processes the output, ensuring that + # extra spaces between commas are stripped + $1=$1 + print +} diff --git a/build-aux/csvm2csv b/build-aux/csvm2csv index addbe26..76b7c46 100755 --- a/build-aux/csvm2csv +++ b/build-aux/csvm2csv @@ -1,8 +1,7 @@ -#!/usr/bin/awk -f -# +#!/bin/bash # Compiles a "magic" CSV file into a normal CSV # -# Copyright (C) 2016, 2018 R-T Specialty, LLC. +# Copyright (C) 2018 R-T Specialty, LLC. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,150 +16,95 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # -# "Magic" CSVs simply exist to make life easier: they permit comments, blank -# lines, variables, sub-delimiter expansion, and any number of ranges per line. -# Ranges will be expanded in every combination, making rate tables highly -# maintainable. -# -# Variables are also supported when defined using :var=val. Variables may -# expand into ranges, 'cause they're awesome. Multiple variables may be -# delimited by semi-colons, as may multiple values. +# For format of CSVMs, see `csvm-expand'. # -# For example: -# :foo=1--3 -# $foo;7;9--10:$foo, 5--10,1/1/2017 -# -# Would generate: -# 1, 5, 1483246800 -# 1, 6, 1483246800 -# ... -# 5, 10, 1483246800 -# 2, 5, 1483246800 -# ... -# 9, 5, 14832468005 -# ... -# 1, 5, 1483246800 -# 1, 6, 1483246800 -# ... +# To disable sorting of CSVM output, use the `!NOSORT' directive before the +# header line. ## +# account for symlinks, since historically this script lives in a different +# directory and has been symlinked for compatibility +declare -r mypath=$( dirname "$( readlink -f "$0" )" ) + -# Expand variable with its value, if any -function expand_vars( s, value ) +# Generate -k arguments for GNU sort given a CSV header +# +# The generated arguments will be of the form -k1,1n ... -kl,ln, where `l' +# is the total number of header entries. +# +# For example, given this header: +# foo, bar, baz +# the output would be: +# -k1,1n -k2,2n -k3,3n +sort-key-args() { - # attempt to parse variable (may expand into a range) - if ( match( s, /^\$([a-zA-Z_-]+)$/, m ) ) - { - value = vars[ m[1] ]; - - if ( value == "" ) - { - print "error: unknown variable reference: `$" m[1] "'" > "/dev/stderr" - exit 1 - } + local -r header="${1?Missing CSV header}" - return value - } + local -i i=0 - return s + # generate -ki,in for each column (notice that a trailing + # comma is added to the header because of the read delimiter) + while read -d,; do + echo -n "-k$((++i)),${i}n " + done <<< "$header," } -# Expand line -function parseline( i, m, j, me, orig ) +# Sort every column of CSV +# +# The columns will all be sorted left-to-right. The header is left in place +# as the first row. +csv-sort() { - if ( i > NF ) - { - print - return - } - - orig = $i - - # expand variables before any processing so that expansions - # can include any type of formatting - $i = expand_vars( $i ) - - if ( match( $i, /^([0-9]+\/){2}[0-9]+$/, m ) ) - { - cmd = "date --date=" $i " +%s" - cmd |& getline $i - close(cmd) - } - - # check first for delimiters - if ( match( $i, /^([^;]+);(.*)$/, m ) ) - { - # give it a shot with the first value - $i = m[1] - parseline( i ) - - # strip off the first value and process with following value(s) - $i = m[2] - parseline( i ) - - # we've delegated; we're done - $i = orig - return - } - - # parse range - if ( match( $i, /^([^-]+)--([^-]+)$/, m ) ) - { - j = expand_vars( m[1] ) - me = expand_vars( m[2] ) - - if ( !match( j, /^[0-9]+$/ ) || !match( me, /^[0-9]+$/ ) ) - { - print "error: invalid range: `" $i "'" > "/dev/stderr" - exit 1 - } + # the first line of the expanded CSVM is the CSV header + local header; read -r header + local -r keys=$( sort-key-args "$header" ) - do - { - $i = j - parseline( i + 1 ) - } while ( j++ < me ) - } - else - { - parseline( i + 1 ); - } - - # restore to original value - $i = orig + # all remaining input (which is now sans header) is sorted + echo "$header" + sort -t, $keys - } -BEGIN { - # we're parsing CSVs - FS = " *, *" - OFS = "," -} +# Output usage information +# +# Kudos to you if you understand the little Easter egg. +usage() +{ + cat <<EOU +Usage: $0 [FILE] +Expand CSVM represented by FILE or stdin into a CSV +The columns of the expanded CSV will be automatically sorted +left-to-right. To inhibit this behavior, use the \`!NOSORT' +directive anywhere before the header line in the source CSVM. -# skip all lines that begin with `#', which denotes a comment, or are empty -/^#|^$/ { next; } +Options: + --help Output usage information. -# lines that begin with a colon are variable definitions -/^:/ { - if ( !match( $0, /^:([a-zA-Z_-]+)=(.*?)$/, m ) ) - { - print "error: invalid variable definition: `" $0 "'" > "/dev/stderr" - exit 1 - } +This program has magic CSV powers. +EOU - vars[ m[1] ] = m[2] - next + exit 64 # EX_USAGE } -# lines that need any sort of processing (ranges, dates, etc) -/--|;|\$[a-zA-Z_-]|\// { parseline( 1 ); next; } -# all other lines are normal; simply output them verbatim +# Sort CSV rows left-to-right unless the `!NOSORT' directive is provided +main() { - # this assignment will ensure that awk processes the output, ensuring that - # extra spaces between commas are stripped - $1=$1 - print + test ! "$1" == --help || usage + + "$mypath/csvm-expand" "$@" \ + | { + local directives; read -r directives + + # ignore sorting if given NOSORT directive + if [[ "$directives" =~ NOSORT ]]; then + cat + else + csv-sort "$sort" + fi + } } + +main "$@" diff --git a/build-aux/test/test-csvm2csv b/build-aux/test/test-csvm2csv index 14ef407..ea6b6db 100755 --- a/build-aux/test/test-csvm2csv +++ b/build-aux/test/test-csvm2csv @@ -38,7 +38,10 @@ run-test() test $? -eq 0 || return 1 # expected output - diff <( cat <<< "$expected" ) <( cat <<< "$given" ) + diff <( cat <<< "$expected" ) <( cat <<< "$given" ) || { + echo "test $testsum failure" >&2 + return 1 + } } @@ -92,11 +95,11 @@ test-delim() declare -r expected='header,line 1,2 +3,6 +3,9 4,2 4,6 -4,9 -3,6 -3,9' +4,9' run-test "$input" "$expected" } @@ -179,11 +182,12 @@ test-var-with-var() :baz=$range;$foo $baz, 5' + # note that the output is sorted declare -r expected='header,line 2,5 +2,5 3,5 -4,5 -2,5' +4,5' run-test "$input" "$expected" } @@ -203,6 +207,51 @@ $foo' } +test-directive-stripped() +{ + declare -r input='!DIRECTIVE +header, line' + + declare -r expected='header,line' + + run-test "$input" "$expected" +} + + +test-no-sort() +{ + declare -r input='!NOSORT +header, line +1,1 +0,0' + + declare -r expected='header,line +1,1 +0,0' + + run-test "$input" "$expected" +} + + +# all directives should be put on a single line +test-fail-multi-directive() +{ + declare -r input='!DIRECTIVE1 +!DIRECTIVE2 +header, line' + + ((testsum++)) + + local -r result=$( + ../csvm2csv 2>&1 <<< "$input" \ + && echo '(test failure: expected failure)' + ) + + grep -q '!DIRECTIVE2' <<< "$result" \ + || return 1 +} + + test-fail-unknown-var-ref() { ((testsum++)) @@ -254,6 +303,9 @@ test-comment \ && test-var-with-range-delim \ && test-var-with-var \ && test-var-zero-ref \ + && test-directive-stripped \ + && test-no-sort \ + && test-fail-multi-directive \ && test-fail-unknown-var-ref \ && test-fail-non-numeric-range \ && test-fail-invalid-var-dfn \ @@ -263,7 +315,7 @@ test-comment \ } # safety check -test "$testsum" -eq 12 || { +test "$testsum" -eq 15 || { echo 'error: did not run all csvm2csv tests!' >&2 exit 1 } |