#!/bin/sh
#
# Print a tab-separated table describing the mail messages found in the
# mailbox files given as operands (standard input when there are none).
#
# The input is cut into messages at every newline that is followed by
# "X-Account-Key: "; whatever precedes the first such line is ignored.
# The table has one row per message and one column per extracted field.
# It is preceded by a "# 1:name<TAB>2:name..." line that names the columns.
# A cell holding several values separates them with commas.
#
# Beyond POSIX this needs mktemp, grep with -o and -m, and an awk that
# accepts a regular expression as RS (gawk and mawk do).

# One key per line in $FIRST_VALUE_IN_HEADERS to get its first value in the
# headers.
FIRST_VALUE_IN_HEADERS='X-UIDL
Return-Path
Message-ID
Subject
Content analysis details
Delivery-date'

# One pair (name, regexp) per line in $DISTINCT_IN_HEADERS_OR_CONTENT to get
# the distinct matching strings in the headers and the content.  The regexps
# are extended regular expressions (grep -E).  The dot in front of the last
# label of the e-mail patterns is escaped so that it only matches a real dot.
DISTINCT_IN_HEADERS_OR_CONTENT='URL https?://[a-zA-Z0-9./?=_%:-]*
IPv4-addr (([0-9]|[0-9]{2}|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[0-9]{2}|1[0-9]{2}|2[0-4][0-9]|25[0-5])
email-addr [a-zA-Z0-9._]+@[a-zA-Z]+\.[a-zA-Z]+
email-addr-with-at [a-zA-Z0-9._]+ *\[at\] *[a-zA-Z]+\.[a-zA-Z]+'

# group_by_line
#
# Reads "N:value" lines from standard input, N being a message number that
# starts at 2 and never decreases.  Writes one line per message number from 2
# up to the last N seen: the values of that message joined with commas, or an
# empty line for a message without any value.  Always writes at least one
# line, so that a field without any match still yields a (blank) column.
group_by_line () {
  awk -F : -v cur=2 '
    {
      msg = $1 + 0
      sub(/[^:]*:/, "")          # drop the "N:" prefix, keep the value
      if (msg != cur) {
        print row                # finish the row of the previous message
        while (++cur < msg)      # blank rows for the messages in between
          print ""
        cur = msg                # also recovers from an unexpected lower N
        row = $0
      } else if (row != "")      # explicit test: a value of "0" is not empty
        row = row "," $0
      else
        row = $0
    }
    END {
      print row
    }'
}

# main [MAILBOX...]
#
# Splits the mailboxes into one header file and one content file per message
# below a temporary directory, extracts every configured field into its own
# column file and pastes the columns together on standard output.
# Sets the globals TMP (temporary directory, removed before returning).
# Returns the exit status of the final paste, or of the step that failed.
main () {
  TMP=$(mktemp -d) || return
  # Single quotes: $TMP is expanded, and quoted, when the trap runs.
  trap 'rm -rf -- "$TMP"' 0

  # Subshell: the directory changes below must not leak to the caller.
  (
    mkdir "$TMP/headers" "$TMP/contents" "$TMP/fields" || exit

    # Messages are numbered by awk record number, so the first message is 2.
    # awk prints the number of every message it wrote; that list doubles as
    # the list of file names inside headers/ and contents/.
    files=$(awk -v RS='\nX-Account-Key: ' -F '\n\n' -v tmp="$TMP/" '
      BEGIN {
        key_file = tmp "fields/X-Account-Key"
      }
      NR > 1 {
        # The record starts with the value of the X-Account-Key header.
        account = $1
        eol = index(account, "\n")
        if (eol)
          account = substr(account, 1, eol - 1)
        print account >> key_file

        # Field 1 is the header block, the other fields are the paragraphs
        # of the content.
        head_file = tmp "headers/" NR
        print $1 > head_file
        close(head_file)

        body_file = tmp "contents/" NR
        printf "" > body_file    # create it even for an empty content
        for (p = 2; p <= NF; ++p) {
          text = $p
          # Join lines ending in "=" (presumably quoted-printable soft
          # line breaks).
          gsub(/=\n/, "", text)
          print text > body_file
        }
        close(body_file)

        print NR
      }' "$@")

    # Every column is produced by a background pipeline writing into a FIFO,
    # so the final paste drains all of them concurrently.  /dev/null is
    # passed to grep as an extra operand to force the "file:" prefix that
    # group_by_line relies on, even with a single message or none at all.
    cd "$TMP/headers" || exit
    printf '%s\n' "$FIRST_VALUE_IN_HEADERS" | while read -r key
    do
      [ -n "$key" ] || continue
      mkfifo "../fields/$key" || continue
      # $files is split on purpose: one operand per message number.
      # shellcheck disable=SC2086
      grep -im 1 -- "^ *$key:" $files /dev/null |
        sed 's/:[^:]*: */:/' |
        tr -d '\r' |
        group_by_line > "../fields/$key" &
    done

    for part in headers contents
    do
      cd "../$part" || exit
      printf '%s\n' "$DISTINCT_IN_HEADERS_OR_CONTENT" |
        while read -r name regexp
        do
          [ -n "$name" ] || continue
          mkfifo "../fields/$name-in-$part" || continue
          # shellcheck disable=SC2086
          grep -Eo -- "$regexp" $files /dev/null |
            LC_ALL=C sort -ut : -k 1,1n -k 2 |
            group_by_line > "../fields/$name-in-$part" &
        done
    done

    cd ../fields || exit
    set -- *

    # Title line: "# 1:first-column<TAB>2:second-column...".
    printf '# '
    i=0
    for name in "$@"
    do
      i=$((i + 1))
      [ "$i" -eq 1 ] || printf '\t'
      printf '%s:%s' "$i" "$name"
    done
    printf '\n'

    paste -- "$@"
  )
  # Keep the status of the subshell across the clean-up without adding
  # another global variable.
  set -- "$?"
  rm -rf -- "$TMP"
  return "$1"
}

main "$@"