import os,re,socket
from glob import glob
from time import time
from config import directory
# Path of the newline-separated list of visitor IP addresses.  collect()
# rewrites this file and find() reads it.
filename_ip = directory.spam+"data/ip"
# Path of the newline-separated list of href values harvested from comment
# files.  collect() rewrites this file and find() reads it.
filename_sites = directory.spam+"data/sites"
# Request-line fragments (images, CSS, JavaScript and .atom requests).
# Not used by any live code in this file.
paraphanalia = ['GET /images/', 'GET /css/', 'GET /js/', '.atom HTTP']
def _read_entries(path):
    """Return the newline-separated entries stored in the file at *path*."""
    with open(path) as handle:
        return handle.read().split('\n')


def _visited_earlier_today(host):
    """Return True if today's log has an entry for *host* over an hour old.

    Today's log is ``directory.log`` + ``YYYYMMDD.log``; a ``.gz`` sibling is
    preferred when it exists.  Returns False when neither file exists or no
    qualifying entry is found.
    """
    import gzip
    import time  # the module; at file level ``time`` is the time() function

    log = directory.log + time.strftime('%Y%m%d.log')
    if os.path.exists(log + ".gz"):
        handle = gzip.open(log + ".gz")
    elif os.path.exists(log):
        handle = open(log)
    else:
        return False

    clock = re.compile(r'(\d\d):(\d\d):(\d\d) ')
    now = time.localtime()
    try:
        for line in handle:
            if line.split(' ', 1)[0] != host:
                continue
            stamp = clock.search(line)
            if stamp is None:
                # Entry without an hh:mm:ss stamp: nothing to compare.
                continue
            # Today's date combined with the time of day found in the entry.
            hms = tuple([int(part) for part in stamp.groups()])
            then = now[:3] + hms + now[6:]
            # NOTE(review): only entries MORE than an hour old qualify;
            # visits within the last hour do not.  Confirm this direction
            # is intended.
            if time.mktime(now) - time.mktime(then) > 3600.0:
                return True
    finally:
        handle.close()
    return False


def _fetched_page_assets(path, ip):
    """Return True if the log at *path* shows *ip* requesting an asset.

    An asset request is an entry whose text contains ``/js/``, ``/css/``,
    ``/images/`` or ``.atom``.
    """
    needles = ('/js/', '/css/', '/images/', '.atom')
    with open(path) as handle:
        for line in handle:
            # Compare the whole first field so that 1.2.3.4 does not match
            # entries logged for 1.2.3.45.
            if line.split(' ', 1)[0] != ip:
                continue
            for needle in needles:
                if needle in line:
                    return True
    return False


def find(host, site):
    """Decide whether *host* / *site* belongs to a known visitor.

    Returns True immediately when *site* is listed in ``filename_sites``.
    Otherwise *host* must either be listed in ``filename_ip`` or appear in
    today's log with an entry more than an hour old, and in addition some
    ``2*.log`` file under ``directory.log`` must show its address requesting
    a script, stylesheet, image or .atom resource.  Returns False in every
    other case, including when *host* cannot be resolved.

    host -- host name or IP address of the visitor
    site -- site URL supplied by the visitor
    """
    try:
        host = socket.gethostbyname(host)
    except Exception:
        # Best effort: keep the name as given when it does not resolve.
        pass

    if site in _read_entries(filename_sites):
        return True

    if host not in _read_entries(filename_ip):
        # not in last scan, try to see if they are here today
        if not _visited_earlier_today(host):
            return False

    # now eliminate fakers
    try:
        ip = socket.gethostbyname(host)
    except Exception:
        return False

    # Newest logs first, so a recent visitor is confirmed quickly.
    for path in sorted(glob(directory.log + '2*.log'), reverse=True):
        if _fetched_page_assets(path, ip):
            return True
    return False
def collect():
    """Rebuild the visitor IP list and the site list used by find().

    Scans the ``*.cmt`` files under ``directory.data`` modified within the
    last 90 days, collecting the ``href`` (site) and ``title`` (host) of each
    author link that follows ``Posted by``.  Hosts are resolved to IP
    addresses, the first field of every entry in every ``*.log`` file under
    ``directory.log`` is added, and the sorted results are written to
    ``filename_ip`` and ``filename_sites``, one entry per line.
    """
    # NOTE(review): the markup inside ``post`` and ``anchor`` was missing
    # from the source under review.  It is reconstructed here as line breaks
    # and an <a ...>...</a> element, which is what the two-value unpacking of
    # the anchor groups requires.  Confirm against the real comment files.
    post = re.compile(r'(?:<br\s*/?>\s<br\s*/?>\s)?Posted by ')
    anchor = re.compile(r'<a (.*)>(.*)</a>$')
    attrs = re.compile(r'(\w+)="(.*?)"')

    quarter = time() - 86400 * 90
    hosts = set()
    sites = set()

    # read 90 days worth of comments
    for path in glob(directory.data + "*.cmt"):
        if os.stat(path).st_mtime <= quarter:
            continue
        with open(path) as handle:
            content = handle.read()
        if content.find('Posted') <= 0:
            continue
        fields = post.split(content)
        if len(fields) != 2:
            continue
        link = anchor.match(fields[1])
        if not link:
            continue
        attributes = dict(attrs.findall(link.group(1)))
        if 'href' in attributes:
            sites.add(attributes['href'])
        if 'title' in attributes:
            hosts.add(attributes['title'])

    # convert host names to ip addresses
    addresses = set()
    for host in hosts:
        try:
            addresses.add(socket.gethostbyname(host))
        except Exception:
            # Best effort: hosts that do not resolve are left out.
            pass

    # read the log files; every address seen is recorded, whatever it asked for
    for log in glob(directory.log + "*.log"):
        with open(log, "r") as handle:
            for line in handle:
                ipaddr = line.split(' ', 1)[0].strip()
                if ipaddr:  # skip blank lines
                    addresses.add(ipaddr)

    # write out ip addresses
    with open(filename_ip, "w") as handle:
        handle.write('\n'.join(sorted(addresses)))

    # write out sites
    with open(filename_sites, "w") as handle:
        handle.write('\n'.join(sorted(sites)))
if __name__ == '__main__':
    import sys
    # Exactly two arguments (host, site): print the verdict from find().
    # Any other argument count: rebuild the lists with collect().
    arguments = sys.argv[1:]
    if len(arguments) == 2:
        host_arg, site_arg = arguments
        print(find(host_arg, site_arg))
    else:
        collect()