#!/bin/awk -f
#
# Reads one mail message on stdin and reports:
#   - IP addresses found in (X-)Received headers, minus ignored networks
#   - hosts referenced by http(s) URLs, followed by e-mail addresses found
#     in the whois output for each host
#   - e-mail addresses found in the message body
#
# NB: needs gawk (uses |& coprocesses, asort() and 3-argument match()).

### whois lookup
# takes whois server and key
# returns result of lookup (every output line, newline terminated)
# NB: server and key are interpolated into a shell command line, so callers
#     must only pass values that were already stripped to host/IP characters.
function whois(server,key    ,command,line,res) {
    command = whoisbin " -h " server " -- " key;
    # test for > 0: getline returns -1 on error, and a bare truth test
    # would then loop forever.  Reading into 'line' leaves $0/NF alone.
    while ( (command |& getline line) > 0 ) {
        res = res line "\n";
    }
    close(command);
    return res;
}

### dig up contact info on IP/host
# takes hostname/IP, array to store contacts in
# returns nothing; 'contacts' is emptied, then filled with the sorted,
# de-duplicated e-mail addresses found in the whois output
function contactinfo(str,contacts    ,out,host,ip,i,k,l,srv,results) {
    # start clean so contacts of a previously looked up host do not leak
    # into this result
    delete contacts;

    # is it an ip? if so try reverse dns
    if ( !match(str,ipstrip) ) {
        host = dns(str,"PTR");
        ip = str;
    } else {
        host = str;
        ip = dns(str,"A");
    }

    ## we're not really interested in contact for domains
    ## cause 9/10 they'll be the spammers.
    #if (host) {
    #    for (srv in hostwhoissrv) {
    #        #print "try for host: " host " @ " hostwhoissrv[srv];
    #        out[i++] = whois(hostwhoissrv[srv],host);
    #    }
    #}

    if (ip) {
        for (srv in ipwhoissrv) {
            #print "try for ip: " ip " @ " ipwhoissrv[srv];
            out[i++] = whois(ipwhoissrv[srv],ip);
        }
    }

    # find email strings and save them
    # NB: need to have good whois sources in order to just blindly
    # pick out email addresses from the output
    for (i in out) {
        matchlines(out[i],emailregex,results);
        for (k in results) {
            contacts[++l] = results[k];
        }
    }

    uniqarray(contacts);
    return;
}

### search given string and
### collate array of strings that match given regex
# takes str   - a multiline string
#       regex - regex to match on in str
#       res   - array to store results in (emptied first)
# returns number of matches stored in res[1..n]; only the first match
# on each line is recorded
function matchlines(str,regex,res    ,n,lines,i,k,temp) {
    delete res;

    # split() copes with a missing trailing newline and with empty input
    n = split(str,lines,"\n");
    for (i = 1; i <= n; i++) {
        if ( match(lines[i],regex,temp) ) {
            res[++k] = temp[0];
        }
    }
    return k + 0;
}

### lookup dns
# takes str a hostname or IP, dns resource record type
#   rr == "PTR" does a reverse lookup (dig -x); any other value does
#   dig's default query
# returns one record or null: the first answer line for PTR, otherwise
# the first answer line that is an IPv4 address (so CNAME lines and
# additional addresses are not glued together into one bogus string)
function dns(str,rr    ,command,line,first,addr) {
    command = digbin " +short";
    if ( rr == "PTR" )
        command = command " -x";
    command = command " " str;

    while ( (command |& getline line) > 0 ) {
        if (line == "")
            continue;
        if (first == "")
            first = line;
        if (addr == "" && line ~ ("^" ipregex "$"))
            addr = line;
    }
    close(command);

    if ( rr == "PTR" )
        return first;
    return addr;
}

### decode %XX and =XX encoded hex to ASCII
# takes a string 'str'
# returns decoded string; '+' becomes a space, and a '%' or '=' that is
# not followed by two hex digits is copied through unchanged
function decode_url (str    ,hextab,i,c,c1,c2,len,code,decoded) {
    # hex to dec lookup table
    hextab ["0"] = 0;    hextab ["8"] = 8;
    hextab ["1"] = 1;    hextab ["9"] = 9;
    hextab ["2"] = 2;    hextab ["A"] = 10;
    hextab ["3"] = 3;    hextab ["B"] = 11;
    hextab ["4"] = 4;    hextab ["C"] = 12;
    hextab ["5"] = 5;    hextab ["D"] = 13;
    hextab ["6"] = 6;    hextab ["E"] = 14;
    hextab ["7"] = 7;    hextab ["F"] = 15;

    decoded = "";
    i = 1;
    len = length (str);

    while ( i <= len ) {
        c = substr (str, i, 1);
        # check for usual start of URI hex encoding chars
        if ( c == "%" || c == "=" ) {
            if ( i+2 <= len ) {
                c1 = toupper(substr(str, i+1, 1));
                c2 = toupper(substr(str, i+2, 1));
                # valid hex encoding?  BOTH digits must be hex
                if ( (c1 in hextab) && (c2 in hextab) ) {
                    code = hextab [c1] * 16 + hextab [c2];
                    c = sprintf ("%c", code);
                    i = i + 2;
                }
            }
        # + is space
        } else if ( c == "+" ) {
            c = " ";
        }
        decoded = decoded c;
        ++i;
    }
    return decoded;
}

### try find a string that looks like a host name or IP address
### from given URL string
# takes 'str' a URL string within to search
# returns possible host string (or empty string)
function defangurl (str    ,h,s,sp,p) {
    # demunge hex encoding
    s = decode_url(str);

    # plain URL?
    if ( s ~ plain_urlregex ) {
        h = s;
    # @ munged URL?  the real host is whatever follows the last @
    } else if ( s ~ atmunge_urlregex ) {
        p = split(s,sp,atmunge_urlsplit);
        h = sp[p];
    # javascript cuteness
    } else if ( s ~ javaregex ) {
        p = match(s,httpregex);
        h = substr(s,p);
    # hmm.. any http there?
    } else if ( s ~ httpregex ) {
        h = s;
    }

    # pick out the first host-like substring and strip out chars that
    # can not be found in host addresses/names
    return mns(h,hregex,hstrip);
}

### match and strip
##
# match first occurrence of regex in string, strip out stripex
# returns resulting string or null string
###
function mns(string,regex,stripex    ,str) {
    if ( match(string,regex,str) ) {
        gsub(stripex,"",str[0]);
        return str[0];
    } else {
        return;
    }
}

#### sort and uniq given array
# takes array "array"
# returns number of elements left; the array itself is left sorted
# and indexed 1..n
####
function uniqarray (array    ,num,i) {
    num = asort(array);

    # delete dupes (equal values are adjacent after sorting)
    for (i=1 ; i < num ; i++) {
        if (array[i] == array[i+1]) {
            delete array[i];
        }
    }

    # sort again to close the index gaps left by the deletes
    return asort(array);
}

### is this IP inside one of the ignored networks?
# takes ip - dotted quad string
# returns 1 if ip equals, or starts on an octet boundary with, one of
# the prefixes in the global 'ignores' array, else 0
function ignoredip (ip    ,ig) {
    for (ig in ignores) {
        # append "." to both sides so that "10" matches 10.x.x.x but
        # not 100.x.x.x, and so that "." is never a regex wildcard
        if ( index(ip ".", ignores[ig] ".") == 1 ) {
            return 1;
        }
    }
    return 0;
}

BEGIN {
    ## flags
    # currently processing headers
    header = 1;
    # processing body
    body = 0;
    # continued header
    contheader = 0;

    # command paths
    whoisbin = "/usr/bin/whois";
    digbin = "/usr/bin/dig";

    ### whois servers
    # to use for IP addresses
    ipwhoissrvlist = "hibernia.jakma.org whois.cyberabuse.org ";
    # for host names only
    hostwhoissrvlist = ipwhoissrvlist "whois.abuse.net ";
    # generate into array
    split(ipwhoissrvlist,ipwhoissrv);
    split(hostwhoissrvlist,hostwhoissrv);

    ### ips to ignore for relay checking
    ignore = "127 10 192.168 212.17 64.39.15.246 193.111.82";
    ignore = ignore " 193.120.224 193.120.242.226 64.158.222.226";
    ignore = ignore " 194.125.22.1 131.211.28.48";
    # generate 172.16/12 and tack on to ignore
    for ( i=16 ; i < 32 ; i++ ) {
        ignore = ignore " 172." i;
    }
    ###

    #generate array
    split (ignore, ignores);

    # javascript
    javaregex = "([Oo]n[mM]ouse|[wW]indow\\.status)";

    ## regexes to match various headers
    # regex to match received lines
    rcvdregex = "^(X-)?(Received)";

    ## regex to match ip address
    # match IP address string
    ipchars = "0-9.";
    ipregex = "(([0-9]){1,3}\\.){3}([0-9]){1,3}";
    # match smtp Received header, received from IP
    iprcvdregex = "\\[" ipregex "\\]";
    # non ip chars
    ipstrip = "[^" ipchars "]";

    ## regex for hostnames
    # valid hostname chars
    hchars = "a-zA-Z0-9\\-";
    # valid FQDN chars
    domchars = hchars "\\.";
    # hostname regex - at least one .
    hnregex = "(([" hchars "])+\\.)+([" hchars "])+";
    # non hostname chars
    hnstrip = "[^a-zA-Z\\-0-9\\.]";

    ## valid host address/name regex
    hregex = "(" ipregex "|" hnregex ")";
    # and not
    hstrip = hnstrip;

    ## email regexes
    # mailbox regexs - least, will catch most normal mailbox names
    mailboxchars = hchars "!#%\\+\\./_";
    mailboxregex = "[" mailboxchars "]+";
    emailregex = mailboxregex "@" hregex;
    emailstrip = "[^" mailboxchars "@]";

    ##regexs to match/strip URLs
    # match http(s) URI
    httpregex = "[hH][Tt][Tt][Pp]([sS])?://";
    # match plain http(s) URL
    plain_urlregex = httpregex "[" domchars "]+(:[0-9]+)?([/\"\\>\\<]|$)";
    # chars to strip out of plain URL
    plain_urlstrip = "(^" httpregex "|[^0-9a-zA-Z.-])";
    # match an @ munged http(s) URL
    atmunge_urlregex = httpregex "[" domchars ":]+.*@([" domchars "])+(:[0-9]+)?([/\"><]|$)";
    # split regex for @ munged addresses
    atmunge_urlsplit = "[@]";
}

# catch each line of spam for later use
# spot end of header / start of body / continued header
# and set flags accordingly
{
    #mail = mail $0 "\n";
    contheader=0;
    numlines++;

    # flag end of headers / begin body
    if ($0 ~ /^$/ ) {
        body = 1;
        header = 0;
    }

    # flag continued headers
    if ( header && ($0 ~ /^[[:space:]]+/) ) {
        contheader = 1;
    }
}

# snarf headers
# currheader keeps the first field of the most recent non-continuation
# header line, so continuation lines are attributed to that header
header == 1 {
    headertxt = headertxt $0 "\n";
    if (contheader == 0) {
        currheader=$1;
    }
}

# process SMTP (X-)Received headers
header == 1 && currheader ~ rcvdregex {
    if ( contheader == 0 ) {
        rcvdtxt = rcvdtxt "\n" $0;
    } else {
        rcvdtxt = rcvdtxt $0;
    }

    for ( i = NF; i > 0 ; i--) {
        if ( $i ~ iprcvdregex) {
            # pull the bracketed address out and drop the brackets
            if ( ipm = mns($i,iprcvdregex,ipstrip) ) {
                mailips[mailipsix++] = ipm;
            }
        }
    }
}

# process any lines with URLs in them
$0 ~ httpregex {
    for (i = NF; i > 0 ; i--) {
        if ( $i ~ httpregex ) {
            httpstrs[httpstrsix++] = $i;
        }
    }
}

# process body lines with possible email addresses
body == 1 && $0 ~ emailregex {
    for (i = NF; i > 0 ; i--) {
        if ( $i ~ emailregex ) {
            emailstrs[emailstrsix++] = $i;
        }
    }
}

END {
    ## sort SMTP server IP list and delete any we wish to ignore
    # print "Header text is:\n" headertxt;
    # print "(X-)Received text:\n" rcvdtxt;
    num = uniqarray(mailips);

    for (i=1 ; i <= num ; i++) {
        if (!mailips[i])
            continue;

        if ( !ignoredip(mailips[i]) ) {
            print "i " i " mailips: " mailips[i];
            #print "/usr/bin/rbcheck " mailips[i] | "/bin/sh";
            #close("/bin/sh");
        }
    }

    ## sort through possible HTTP hosts/addresses strings
    num = uniqarray(httpstrs);
    i=0;

    # is there actually anything there?
    for (h = 1; h <= num; h++) {
        host = defangurl(httpstrs[h]);
        #print "find: " httpstrs[h];
        if (host) {
            hosts[i++] = host;
        }
    }
    delete httpstrs;

    # sort through our array of hosts
    num = uniqarray(hosts);
    for (h=1 ; h <= num; h++) {
        if (!hosts[h])
            continue;
        print "h " h " " hosts[h];
        contactinfo(hosts[h],lookup);
        for (l in lookup) {
            print lookup[l];
        }
    }

    ## sort through email addresses
    num = uniqarray(emailstrs);
    for (i=1; i <= num; i++) {
        if (!emailstrs[i])
            continue;
        if (emailstrs[i] ~ httpregex)
            continue;
        em = mns(emailstrs[i],emailregex,emailstrip);
        print "e: " em;
    }
}