#!/bin/nawk -f -
#
# Eliminate entries that are merely prefixes of one another, and entries
# that are just one character long.
#
# Input:  one entry per line, in the form
#             <count> '<string>'
# Output: the surviving entries, one per line, in the form
#             <count * length(string)> '<string>'
#
# Each entry is compared only with the entry on the line directly before
# it, so a prefix must immediately precede its extension for the pruning
# to apply (presumably the input arrives sorted by string -- confirm
# against whatever feeds this script).
#

# Print one surviving entry, weighted by the length of its string.
# Strings of one character or fewer (which includes the empty string the
# state starts out with) are dropped.
function emit(count, str) {
    if (length(str) > 1) {
        # A plain ' is written here on purpose: \' is not a defined escape
        # sequence in an awk string literal, and implementations disagree
        # on whether the backslash survives into the output.
        print (count * length(str)) " '" str "'";
    }
}

BEGIN {
    lastnum = 0;
    laststr = "";
}

{
    currnum = $1;

    # Strip the leading "<count> '" and the trailing "'" to recover the
    # bare string.  Both patterns are anchored, so each can match at most
    # once and sub() is sufficient.
    currstr = $0;
    sub(/^[0-9]* '/, "", currstr);
    sub(/'$/, "", currstr);

    # Does the previous string appear at the very start of this one?
    extends = (index(currstr, laststr) == 1);

    # Does this entry occur 'interestingly less often' than the previous
    # one?  Defined here as: the previous count is more than 15 percent
    # higher than this entry's count.
    rarer = ((currnum * 1.15) < lastnum && lastnum != 0);

    # Emit the previous entry if this one is not an extension of it, or if
    # it is an extension that is interestingly rarer.  Otherwise 'most of'
    # the strings which match the previous entry are really just this
    # entry, so the previous one is only a prefix and is dropped silently.
    if (!extends || rarer) {
        emit(lastnum, laststr);
    }

    lastnum = currnum;
    laststr = currstr;
}

END {
    # The last entry read is still pending; flush it.
    emit(lastnum, laststr);
}