#! /usr/bin/env bash

# Print the help text on standard output and terminate the script.
#
# Globals:   $0 (read) - only its last path component is shown.
# Arguments: none
# Outputs:   usage summary and option list on stdout
# Exits:     always with status 1, including when help was requested with -h
function usage
{
    # Strip the directory part with parameter expansion instead of an
    # unquoted `basename $0`, which breaks when the path contains whitespace.
    local prog=${0##*/}

    cat <<EOF

$prog [options] [<columns>]

Extracts the given columns from an ASCII Bro log on standard input.
If no columns are given, all are selected. By default, bro-cut does
not include format header blocks into the output.

Example: cat conn.log | $prog -d ts id.orig_h id.orig_p

    -c       Include the first format header block into the output.
    -C       Include all format header blocks into the output.
    -d       Convert time values into human-readable format (needs gawk).
    -D <fmt> Like -d, but specify format for time (see strftime(3) for syntax).
    -F <ofs> Sets a different output field separator.
    -n       Print all fields *except* those specified.
    -u       Like -d, but print timestamps in UTC instead of local time (needs gawk).
    -U <fmt> Like -D, but print timestamps in UTC instead of local time (needs gawk).

For the time conversion, the format string can also be specified by setting
an environment variable BRO_CUT_TIMEFMT.

EOF
    exit 1
}

# Time format used by -d/-u. BRO_CUT_TIMEFMT, when set and non-empty,
# replaces the built-in default; -D/-U later override either one.
timefmt=${BRO_CUT_TIMEFMT:-%Y-%m-%dT%H:%M:%S%z}

headers=0       # 0: no header blocks, 1: first block only (-c), 2: all (-C)
substtime=0     # 1: convert time columns with strftime
utc=0           # 1: format times in UTC rather than local time
ofs=""          # custom output field separator (-F); empty means "same as input"
gnu=0           # 1: the selected awk identifies itself as GNU awk
utc_capable=0   # 1: the selected awk accepts three-argument strftime
negate=0        # 1: print every column except the listed ones (-n)
awk=awk

# Prefer GNU awk if found so that we can do time conversion.
# `command -v` is a shell builtin, unlike `which`, which may not be installed.
if command -v gawk >/dev/null 2>&1; then
    awk=gawk
fi

# Check if awk is GNU awk (even for gawk). stdin is redirected so that the
# probe can never consume the log data waiting on standard input.
if "$awk" --version </dev/null 2>&1 | head -1 | grep -q '^GNU'; then
    gnu=1
fi

# Probe for three-arg strftime introduced in gawk 3.1.6. The input is
# /dev/null, so the action never runs; only parsing the program matters.
if "$awk" '{print strftime("%Y",1,1)}' /dev/null >/dev/null 2>&1; then
    utc_capable=1
fi

# Parse command-line options. Anything that is not a known flag, including
# -h, falls through to usage, which prints the help text and exits.
while getopts "cCdD:F:uU:nh" opt; do
    case "$opt" in
        c) headers=1 ;;
        C) headers=2 ;;
        d) substtime=1 ;;
        D) substtime=1; timefmt=$OPTARG ;;
        F) ofs=$OPTARG ;;
        u) substtime=1; utc=1 ;;
        U) substtime=1; utc=1; timefmt=$OPTARG ;;
        n) negate=1 ;;
        *) usage ;;
    esac
done

# Time conversion relies on strftime, which only GNU awk is known to provide.
if [[ "$substtime" == 1 && "$gnu" != 1 ]]; then
    echo "Options -d/-D/-u/-U only supported with gawk" >&2
    exit 1
fi

# UTC output additionally needs the three-argument form of strftime.
if [[ "$utc" == 1 && "$utc_capable" != 1 ]]; then
    echo "Options -u/-U require gawk >= 3.1.6" >&2
    exit 1
fi

# Drop the parsed options; what remains are the column names.
shift $((OPTIND - 1))

# Join the column names into a single colon-separated list. Names may be
# given as separate arguments and/or separated by commas or spaces.
# "$*" with printf avoids pathname expansion of the arguments and keeps
# echo from treating a leading -n/-e as one of its own options.
fields=$(printf '%s\n' "$*" | sed 's/[ ,] */:/g')

script='

# Print a fatal error message on stderr and stop with exit status 1.
function error(msg) {
    print "bro-cut error: " msg >"/dev/stderr";
    exit(1);
}

# Return the character whose numeric code is c.
function asc(c) {
    return sprintf("%c", c);
}

# Return the value (0-15) of the hexadecimal digit d, or -1 if d is not one.
function hexdigit(d) {
    return index("0123456789abcdef", tolower(d)) - 1;
}

# Decode a separator written as a sequence of "\xXX" escapes (the form used
# on the "#separator" header line) into the literal characters.
# hi and lo are local scratch variables, not arguments; keeping them local
# makes the recursion independent of evaluation order.
function parseSep(s,    hi, lo) {
    if ( s == "" )
        return "";

    hi = substr(s, 3, 1);
    lo = substr(s, 4, 1);

    # Each escape is four characters long, so the remainder starts at 5.
    return asc(hexdigit(hi) * 16 + hexdigit(lo)) parseSep(substr(s, 5));
}

# True if the current header line should be copied to the output: always
# when headers == 2, and only before the first data line when headers == 1.
function printHeader() {
    return headers == 2 || (headers == 1 && first_header);
}

BEGIN {
    first_header = 1;

    # f[1..len_f] holds the requested names in the order given; idx is used
    # for membership tests when the selection is negated.
    len_f = split(fields, f, /:/);
    for ( i = 1; i <= len_f; ++i )
        idx[f[i]] = i;
}

/^#separator/ {
    split($0, s, / /);
    FS = parseSep(s[2]);

    if ( custom_ofs != "" )
        OFS = custom_ofs;
    else
        OFS = FS;

    if ( printHeader() )
        print;

    next;
}

# Build columns[1..len_columns]: for every output position, the index of the
# input column to print. Rebuilt from scratch for each header block.
/^#fields/ {
    split("", columns);
    split("", pos);

    # Map each field name to its input column ($1 is the "#fields" tag).
    for ( i = 2; i <= NF; ++i )
        pos[$i] = i - 1;

    if ( fields == "" ) {
        # Select all fields.
        len_columns = NF - 1;
        for ( i = 1; i <= len_columns; ++i )
            columns[i] = i;
    }

    else if ( ! negate ) {
        # Resolve every requested name on its own, so that a name given
        # more than once fills each of its output positions.
        len_columns = len_f;

        for ( i = 1; i <= len_f; ++i ) {
            if ( ! (f[i] in pos) )
                error("unknown field \"" f[i] "\"");

            columns[i] = pos[f[i]];
        }
    }

    else {
        # Negate, exclude given fields.
        len_columns = 0;

        for ( i = 2; i <= NF; ++i ) {
            if ( ! ($i in idx) )
                columns[++len_columns] = i - 1;
        }
    }
}

# Remember which input columns have type "time".
/^#types/ {
    split("", times);

    for ( i = 2; i <= NF; ++i )
        times[i-1] = ($i == "time");
}

# Print the "#fields"/"#types" header reduced to the selected columns.
/^#(fields|types)/ && printHeader() {
    printf("%s", $1);
    for ( i = 1; i <= len_columns; ++i ) {
        j = int(columns[i]);
        val = $(j + 1);

        # times is indexed by input column, not by output position.
        # Converted timestamps are no longer numeric, hence "string".
        if ( $1 == "#types" && substtime && times[j] == "1" )
            val = "string";

        printf("%s%s", OFS, val);
    }

    print "";
    next;
}

# Any other header line is passed through or dropped as a whole.
/^#/ {
    if ( printHeader() )
        print;

    next;
}

# Data line: print the selected columns in the requested order.
{
    first_header = 0;

    for ( i = 1; i <= len_columns; ++i ) {
        j = int(columns[i]);
        val = $j;

        # Only convert values that look like a timestamp; anything else,
        # such as the "-" placeholder, is passed through unchanged instead
        # of being formatted as the epoch.
        if ( substtime && times[j] == "1" && val ~ /^[0-9]+(\.[0-9]*)?$/ )
            val = strftime_safe(timefmt, val, utc);

        if ( i > 1 )
            printf("%s%s", OFS, val);
        else
            printf("%s", val);
    }

    print "";
}
'

# The awk program calls strftime_safe(format, timestamp, utc); pick a
# definition that the selected awk is able to parse.
if [[ "$gnu" != 1 ]]; then
    # Add a dummy function for awks that don't have it. This will never be
    # called, because the time options are rejected unless awk is GNU awk.
    dummy_strftime="function strftime_safe(a,b,c) {}"
elif [[ "$utc_capable" == 1 ]]; then
    # Patch up strftime for the optional UTC argument.
    dummy_strftime="function strftime_safe(a,b,c) {return strftime(a,b,c);}"
else
    # Older gawk: strftime takes only format and timestamp.
    dummy_strftime="function strftime_safe(a,b,c) {return strftime(a,b);}"
fi

# Run the filter on standard input. Every assignment is quoted so that the
# values reach awk as single words, without word splitting or globbing.
"$awk" \
    -v "fields=$fields" \
    -v "headers=$headers" \
    -v "custom_ofs=$ofs" \
    -v "substtime=$substtime" \
    -v "utc=$utc" \
    -v "timefmt=$timefmt" \
    -v "negate=$negate" \
    "$script $dummy_strftime"
