"""Access-log based vetting of hosts and sites.

Run with no arguments to rebuild the two data files (`collect`); run with
``HOST SITE`` to print the verdict of `find` for that pair.

NOTE(review): the paths live under ``directory.spam`` and the comment files
are scanned for "Posted by" trailers, so this presumably backs a
comment-spam filter -- confirm against the callers.
"""

import gzip
import os
import re
import socket
from glob import glob
from time import localtime, mktime, strftime, time

from config import directory

filename_ip = directory.spam+"data/ip"
filename_sites = directory.spam+"data/sites"

# Request fragments used by an earlier, now disabled, log filter.  Kept
# because other modules may import the name.
paraphanalia = ['GET /images/', 'GET /css/', 'GET /js/', '.atom HTTP']

# HH:MM:SS followed by a space, as found in an access-log timestamp.
_CLOCK = re.compile(r'(\d\d):(\d\d):(\d\d) ')

# Substrings of a request that `find` accepts as evidence of a real page view.
_ASSET_MARKERS = ('/js/', '/css/', '/images/', '.atom')

# NOTE(review): the copy of this file under review had this literal split
# over several lines with nothing but whitespace escapes inside the optional
# group; markup (e.g. line-break tags) may have been stripped from it.  The
# optional prefix does not affect the author text extracted below.
_POSTED_BY = re.compile(r'(?:\n\s\n\s)?Posted by ')

# NOTE(review): reconstructed.  The copy under review read r'(.*)$', which
# has one group while the code unpacks two and then parses the first as
# name="value" attributes looking for href/title -- i.e. an HTML anchor.
# Confirm the exact pattern against version control.
_ANCHOR = re.compile(r'<a(.*?)>(.*)</a>$')

# name="value" pairs inside a tag.
_ATTRIBUTE = re.compile(r'(\w+)="(.*?)"')

# Errors a host-name lookup can raise for an unresolvable or malformed name.
_LOOKUP_ERRORS = (socket.error, UnicodeError)


def _read_entries(path):
    """Return the newline-separated entries stored in the data file *path*."""
    with open(path, encoding='utf-8') as handle:
        return handle.read().split('\n')


def _open_log(path):
    """Open an access log (gzip-compressed if it ends in .gz) as text.

    latin-1 maps every byte to a character, so a malformed request line
    cannot abort a scan with a decoding error.
    """
    if path.endswith('.gz'):
        return gzip.open(path, 'rt', encoding='latin-1')
    return open(path, encoding='latin-1')


def _seen_earlier_today(address):
    """Return True if today's log has a hit from *address* over an hour old.

    Today's log is ``directory.log`` + YYYYMMDD.log, with the ``.gz`` variant
    preferred when both exist.  Returns False when neither file exists.
    """
    today = directory.log + strftime('%Y%m%d.log')
    if os.path.exists(today + ".gz"):
        path = today + ".gz"
    elif os.path.exists(today):
        path = today
    else:
        return False

    with _open_log(path) as entries:
        for line in entries:
            if line.split(' ', 1)[0] != address:
                continue
            stamp = _CLOCK.search(line)
            if stamp is None:
                # No parsable time on this line; it cannot prove anything.
                continue
            # Graft the logged HH:MM:SS onto today's date so both values can
            # go through mktime and be compared in seconds.
            now = list(localtime())
            then = now[:3] + [int(part) for part in stamp.groups()] + now[6:]
            if mktime(tuple(now)) - mktime(tuple(then)) > 3600.0:
                return True
    return False


def _fetched_page_assets(ip):
    """Return True if any dated log shows *ip* requesting page furniture.

    Logs matching ``directory.log`` + 2*.log are scanned newest name first.
    The client is matched on the whole first field rather than by prefix, so
    1.2.3.4 is not confused with 1.2.3.45.
    """
    for path in sorted(glob(directory.log + '2*.log'), reverse=True):
        with _open_log(path) as entries:
            for line in entries:
                if line.split(' ', 1)[0] != ip:
                    continue
                if any(marker in line for marker in _ASSET_MARKERS):
                    return True
    return False


def find(host, site):
    """Decide whether *host* / *site* pass the log-based checks.

    Returns True immediately when *site* is listed in ``filename_sites``.
    Otherwise the host's address must either be listed in ``filename_ip`` or
    appear in today's log more than an hour ago, and in addition must have
    requested a /js/, /css/, /images/ or .atom URL in one of the dated logs.

    Raises whatever ``open`` raises if a data file is missing.
    """
    try:
        host = socket.gethostbyname(host)
    except _LOOKUP_ERRORS:
        pass  # best effort: fall back to comparing the name as given

    if site in _read_entries(filename_sites):
        return True

    if host not in _read_entries(filename_ip):
        # not in last scan, try to see if they are here today
        if not _seen_earlier_today(host):
            return False

    # now eliminate fakers
    try:
        ip = socket.gethostbyname(host)
    except _LOOKUP_ERRORS:
        return False
    return _fetched_page_assets(ip)


def collect():
    """Rebuild ``filename_ip`` and ``filename_sites``.

    Sites and hosts are taken from the href and title attributes of the
    anchor that follows "Posted by " in comment files (``directory.data`` +
    *.cmt) modified within the last 90 days.  Hosts are resolved to
    addresses, then the client address of every line in every
    ``directory.log`` + *.log file is added.  Both lists are written sorted,
    one entry per line.
    """
    quarter = time() - 86400*90

    hosts = set()
    sites = set()

    # read 90 days worth of comments
    for path in glob(directory.data + "*.cmt"):
        if os.stat(path).st_mtime <= quarter:
            continue
        with open(path, encoding='utf-8', errors='replace') as handle:
            content = handle.read()
        if content.find('Posted') <= 0:
            continue
        pieces = _POSTED_BY.split(content)
        if len(pieces) != 2:
            continue
        link = _ANCHOR.match(pieces[1])
        if not link:
            continue
        attributes = dict(_ATTRIBUTE.findall(link.group(1)))
        if 'href' in attributes:
            sites.add(attributes['href'])
        if 'title' in attributes:
            hosts.add(attributes['title'])

    # convert host names to ip addresses
    addresses = set()
    for host in hosts:
        try:
            addresses.add(socket.gethostbyname(host))
        except _LOOKUP_ERRORS:
            pass  # unresolvable hosts are simply left out

    # read the log file
    # An earlier revision only kept clients that requested page furniture
    # (see paraphanalia); that filter is disabled, so every client counts.
    for path in glob(directory.log + "*.log"):
        with _open_log(path) as entries:
            for line in entries:
                fields = line.split(' ', 1)
                # Skip blank or space-less lines: their "first field" would
                # carry the newline into the data file.
                if len(fields) == 2 and fields[0]:
                    addresses.add(fields[0])

    # write out ip addresses
    with open(filename_ip, "w", encoding='utf-8') as out:
        out.write('\n'.join(sorted(addresses)))

    # write out sites
    with open(filename_sites, "w", encoding='utf-8') as out:
        out.write('\n'.join(sorted(sites)))


if __name__ == '__main__':
    import sys
    if len(sys.argv) == 3:
        print(find(sys.argv[1], sys.argv[2]))
    else:
        collect()