Optimized 'archiver' script with clever use of awk

Seems to run about 4 times faster now, which is a decent time saving when running the script on the cluster. Uses an awk one-liner (`awk FNR-1`) to concatenate all raw .csv files (excluding their headers). This replaces the old approach, which read each file, trimmed the header, and appended the lines to a HUGE variable.
author: Adam M. Stück <adam@adast.xyz> 2022-12-17 15:41:29 +0100
committer: Adam M. Stück <adam@adast.xyz> 2022-12-17 19:26:07 +0100
commit: d1828d9658cb3e17984be757f57224a9e98bce50 (patch)
tree: fdb15fe1d8abf2742e8382792381b48dc1564f9e /archiver
parent: 748bf8ca8e76d7dd48ba744c8a9e405b8ee9de0a (diff)
1 files changed, 13 insertions, 24 deletions
diff --git a/archiver b/archiver
index 58b5bf5..8f78f2b 100755
--- a/archiver
+++ b/archiver
@@ -17,38 +17,29 @@ main() {
         exit 1
     fi
 
-    aggregate_files 
+    if [[ -f "$RESULTS_PATH/data.csv" ]]; then 
+        rm "$RESULTS_PATH/data.csv"
+    fi
+
+    aggregate_files > "$RESULTS_PATH/data.csv"
 }
 
 aggregate_files() {
-    OUTPUT=""
-    HEADER_INSERTED=false
-
-    while IFS= read -r -d '' ENTRY
-    do
-        if [ $HEADER_INSERTED == true ]; then
-            FILE=$(tail -n +2 "$ENTRY")
-        else 
-            FILE=$(cat "$ENTRY")
-        fi
-
-        OUTPUT+="$FILE\n"
-        HEADER_INSERTED=true
-    done <   <(find "$RESULTS_PATH" -maxdepth 1 -name '*.csv' ! -name 'data.csv' -type f -print0)
-
-    HEADER=$(echo -e "$OUTPUT" | head -n 1)
-    ALL_ROWS=$(echo -e "$OUTPUT" | tail -n +2 | sort -t$'\t' -k6,6 -n) 
-    STRATS=$(echo -e "$ALL_ROWS" | awk -F '\t' '{print $5}' | sort | uniq)
+    ALL_ROWS=$(awk FNR-1 "$RESULTS_PATH"/*.csv 2>/dev/null | sort -t$'\t' -k6,6 -n) 
+    STRATS=$(echo "$ALL_ROWS" | awk -F '\t' '{print $5}' | sort | uniq)
 
     OUTPUT=""
     while read -r STRAT; do
         [ -z "$STRAT" ] && continue
-        ROWS=$(echo -e "$ALL_ROWS" | grep -P "\t$STRAT\t")
+        ROWS=$(echo "$ALL_ROWS" | grep -P "\t$STRAT\t")
         OUTPUT+="$ROWS\n"
     done <<< "$STRATS"
 
-    OUTPUT=$(echo -e "$HEADER\n$OUTPUT" | head -n -1)
-    echo -e "$OUTPUT" > "$RESULTS_PATH/data.csv"
+    echo -e "$(header)\n$OUTPUT" | head -n-1
+}
+
+header() {
+    echo -e "model\tquery\tsolved\tresult\tstrategy\ttime\tdate\ttime-limit\tmemory\texit-code\tformula\ttimed-out\terror-msg\tdiscoveredStates\texploredStates\texpandedStates\tmaxTokens\tsearchTime"
 }
 
 help() {
@@ -57,8 +48,6 @@ usage: $0 RESULTS-DIR
 
 Aggregate data from search stragey benchmark
 
-Options:
-  -h, --help            Show this message
 EOF
 }
author	Adam M. Stück <adam@adast.xyz>	2022-12-17 15:41:29 +0100
committer	Adam M. Stück <adam@adast.xyz>	2022-12-17 19:26:07 +0100
commit	d1828d9658cb3e17984be757f57224a9e98bce50 (patch)
tree	fdb15fe1d8abf2742e8382792381b48dc1564f9e /archiver
parent	748bf8ca8e76d7dd48ba744c8a9e405b8ee9de0a (diff)