
(sPMc           @   sf  d  Z  d d l Z d d l Z d d l Z d d l Z d d l Z d d l m Z d d l Z d d l	 Z	 d d l
 Z
 d d l Z d d l Z d d l Z d d l Z d d l m Z y d d l m Z Wn d d l m Z n Xe j d  Z e j d  Z e j d	  Z e j d
  Z e a d   Z d d  Z d   Z d   Z d   Z e  d  Z! d S(   s   
Fetch either a single feed, or a set of feeds, normalize to Atom and XHTML,
and write each as a set of entries in a cache directory.
iN(   t   minidom(   t   StringIO(   t   md5(   t   news   ^\w+:/*(\w+:|www\.)?s   [?/:|]+s   ^[,.]*s   [,.]*$c         C   ss  yO t  j |  rN t | t  r< | j d  j d  } qN | j d  } n  Wn n Xt | t  rz | j d  } n  t  j d |  } t j d |  } t	 j d |  } t
 j d |  } t |  d k r`| j d  } xz t t |  d d  D]] } t d j | |    d k  r d j | |   d t d j | |   j   } Pq q Wn  t j j |  |  S(	   s   Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    s   utf-8t   idnat    t   ,i   i    ii   (   t   re_url_schemet   matcht
   isinstancet   strt   decodet   encodet   unicodet   subt   re_slasht   re_initial_cruftt   re_final_cruftt   lent   splitt   ranget   joinR   t	   hexdigestt   ost   path(   t	   directoryt   filenamet   partst   i(    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyR      s*     c         C   sI   t  | d  } | j |   | j   | rE t j | | | f  n  d S(   s     write the document out to disk t   wN(   t   opent   writet   closeR   t   utime(   t   xdoct   outt   mtimet   file(    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyR   :   s
    
 c         C   s   t  j  |   } | d d k S(   Ni    t   httpt   https(   s   https   https(   t   urlparse(   t   urit   parsed(    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyt   _is_http_uriA   s    c         C   s  t  j } t j   } t j   } | j d  s | j d  r` t | j  d k r` d | _ q | j	 r | j
 j j j   d k r d | _ q d | _ n  t j t j   d t j |    } | j d k ro| j d	  ro| j | j d
 <| j d  r4t | j  d k r4| j d |   d | j d <qQ|  | j k rV| j d |   qQ| j d |  | j  n| j d k r| j d  rt | j  d k r| j d |  | j  | j | j d
 <n| j d k r| j d	  r| j | j d
 <|  | j k r| j d |   n | j d |  | j  | j j d  s| j j d  r| j j } t j |  | k r}d  SqqQ| j j j d  rd  S| j j j d  rQ| j j j d  rQ| j d =qQn| | j d k r| j d |   nZ | j d k r| j d |   n8 | j d k rA| j d | j |   n | j d |   | j d  r| j d  r| j | _ | j j d d  d k | _	 | j j d   | _ n  t | j  | j d! <| j d"  r| j d#  r| j r| j | j d$ <n6 | j j d#  r8| j d# r8| j d# | j d$ <n  | j j d%  ra| j d% | j d& <n4 | j d'  r| j  rt j! | j   | j d& <n  | j j d(  r| j d( | j d) <qn  | j d  r| j j d*  st"   | j d* <n  d+ } | j j d,  rd- } n  | j dO k r.d0 } n  xe | j j# D]# }	 |	 j$ d1 k r;| |	 d2 <Pq;q;W| j j# j% t j& i d1 d3 6| d2 6|  d4 6  n  x4 t j' |   j(   D] \ }
 } | | j d5 |
 <qWt) j) |  |  d6 d7 l  m* } t+ d  k r| j-   a+ n  i  } x | j D] } | j d8  s8| j. rQt/ j. d  |  | d8 <n. t0 | d8 d9  r| d8 j1   d | d8 <n  | d8 sqn  d: } | j d;  r| j2 } n  | j d<  r| j3 } n  | | j | j. dP  d k r| | f | | j. <qqWt j4   } xl| j1   D]^\ } } t5 | | j.  } t6 j7 j8 |  rSqn  t5 | | j.  } d  } | j d=  s| d= r| j d> d   | d= <n  | j d=  ry t9 j: | j;  } WqqXn  | s2y t6 j< |  j= } Wq2| j j d=  r/y t9 j: | j j;  } Wq+q+Xq/q2Xn  | sGt j   } n  t j |  | d= <t/ j/ | |  } | j>   j? d?  } | j@   x9 t jA |   D]( } tB jC | | d@ dA } | sPqqW| st6 j7 j8 |  rt6 jD |  qqn  tE | | |  t+ d  k r| j j d8 | j j dB d    } | r{	tF |  tG k r_	| j? d?  } n  | t+ t5 d: | j.  <q{	qqWt+ r	t+ jH   n  t j |   rv
g  | j D] } | j d=  r	| j; ^ q	} | jI   | r	t jJ dC | d6  | j d <n- | j j d  r,
t j | j j  g } n  | sC
| d6 | k  rv
dD t j |   } | j |  | | j d <qv
n  | j dE k r
| j j d  r
| j d =n  | j j d  r| j d | j d <qn | j dF k r
dG | j d <n | j dH k rdI | j d <n | j d k r-dJ | j d <nd | j d k rLdK | j d <nE | j d k rkdL | j d <n& | j d k rdM | j | j d <n  t6 j7 j8 |  st6 jK |  n  tL jM dN t  jN  } t/ jO | jP | j | j	 | j  tE | j>   j? d?  t5 | |    | j@   d  S(Q   Nt   statust   entriesi    i   t   timeouti  i  iQ t   urlt   planet_http_locations
   No data %ss   no datat   planet_messages   Updating feed %ss   Updating feed %s @ %si-  s    Feed has moved from <%s> to <%s>i0  s   Feed %s unchangeds   Feed %s unchanged @ %st   planet_updateds   no activity int	   duplicatei  s   Feed %s gones   Feed %s timed outi  s   Error %d while updating feed %st   versiont   planet_bozot   truet   planet_formatt   planet_http_statust   headerst   etagt   planet_http_etags   last-modifiedt   planet_http_last_modifiedt   modifieds   -content-hasht   planet_content_hasht   linkss   application/atom+xmlt   rsss   application/rss+xmlt   rss090t   rss10s   application/rdf+xmlt   selft   typet   relt   hreft   planet_i(   t   idindext   idt   valuesR   t	   publishedt   updatedt   updated_parsedt   published_parseds   utf-8t   modet   filtert   links   %Y-%m-%dT%H:%M:%SZs   no activity in %d daysi   i  s   403: forbiddeni  s   404: not founds   408: request timeouts	   410: gones   internal server errors   http status %ssD   <feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>
(   s   rss090s   rss10(   R   (Q   t   planett   loggert   configt   cache_sources_directoryt   cache_blacklist_directoryt   has_keyR   R-   R,   t   bozot   bozo_exceptiont	   __class__t   __name__t   lowert   timet   gmtimet   activity_thresholdR/   t   feedt   warningt   infoR2   t
   feedparsert   _parse_date_iso8601R1   t
   startswitht   errort   getR4   R
   R:   R9   R=   t   asctimet   listR?   RE   t   appendt   FeedParserDictt   feed_optionst   itemst   scrubRH   t   indext   NoneR   RI   t   reconstitutet   hasattrRJ   RK   RL   t   cache_directoryR   R   R   t   existst   calendart   timegmRM   t   statt   st_mtimet   toxmlR   t   unlinkt   filterst   shellt   runt   removeR   RD   R   R    t   sortt   strftimet   makedirsR    t   parseStringt   xmlnst   sourcet   documentElement(   t   feed_urit	   feed_infot   datat   logt   sourcest	   blacklistt   activity_horizonRL   t   feedtypeRQ   t   namet   valueRH   t   idst   entryt   cachet   blacklist_filet
   cache_fileR$   R"   t   outputRP   t   feedidt   msg(    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyt
   writeCacheE   sb   	$	&$3
  	 	
"" 
    
  $   
   "%c      	   C   s  d d  l  } d d l m } | j t j    } | j d t  \ } } x| r| j d | |   t	 d  }	 t
 |	 d |  t
 |	 d t j i d d	 6  yy_ t | t  r | j d
  }
 n | j d  j d
  }
 |
 | k r| j d | |
  n  Wn | j d |  | }
 n Xi  } | j j d  rO| j d | d <n  | j j d  ru| j d | d <n  | j |
 d d | \ } } t | pd  j   | d <| j d k r| j rd | _ q| j j d  r| j d | d k rd | _ qn  t	 |  }	 t
 |	 d | j d |   | j d  rL| d =n  t
 |	 d |  WnD| k
 r| j d | |   n!| j k
 r} | j d t |  |   n t j k
 r} | j j j   d k rd |	 j d	 <| j  d |   q| j d t |  |   n t! k
 r} d d  l" } d d  l# } | j$   \ } } } | j d |  x> | j% | |  | j& |  D] } | j | j'    qWn X| j( d t d  | | |	 f  | j d t  \ } } qL Wd  S(!   Ni(   t   BadStatusLinet   blocks   Fetching %s via %dR   R/   R9   t   500R,   R   s   utf-8s   IRI %s mapped to %ss   unable to map %s to a URIR;   s   If-None-MatchR<   s   If-Modified-Sincet   GETs   -content-hashi   i0  R>   s   content-locations   content-encodings&   Bad Status Line received for %s via %ds   HttpLib2Error: %s via %dR.   t   408s   Timeout in thread-%ds   HTTP Error: %s in thread-%ds   Error processing %st   item()   t   httplib2t   httplibR   t   HttpRT   t   http_cache_directoryRg   t   TrueRb   R   t   setattrRc   Rk   R	   R   R   R   R`   RW   t   requestR   R   R,   t	   fromcacheRf   t   HttpLib2ErrorR
   t   socketRZ   R[   R\   R9   t   warnt	   Exceptiont   syst	   tracebackt   exc_infot   format_exception_onlyt	   format_tbt   rstript   put(   t   thread_indext   input_queuet   output_queueR   R   R   t   hR)   R   R`   R   R9   t   respt   contentt   eR   R   RD   R   t   tbt   line(    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyt
   httpThread%  st    		 
	

	c         C   s  t  j } t a t j   } y' t j t |   | j	 d |  WnT y3 d d l
 } | j t |   | j	 d |  Wq | j d |  q Xn Xd d l m } d d l m } |   } |   } i  } t j   }	 |	 rt j j |	  rt j |	  n  t t j    rqxc t t t j     D]9 }
 | d t d |
 | | | f  | |
 <| |
 j   q1Wn | j	 d	  x t j   D] } t j   } t | |  } t j |  } | j r|  r| j	 d
 |  qn  | j j  d d  d k r| j	 d |  qn  | r<t" |  r<| j# d | | f  q| j# d | | | f  qWx$ | j$   D] } | j# d d#  qfWi  } x6| j%   s| j%   s| rx| j%   rf| j  t&  \ } } } yt' | d  st | j( j)  d k  rki  } t' | d  rV| j j  d d  | d <y" t* j+ | j j  d d   } WqVqVXn  t j | |  } nR t j, i d d 6| j( d 6g  d 6i  d 6| j- d 6d d 6t | j( j)  d 6 } | j j  d d  } | s| j j  d d  } n  | } | j. d  r| j/ } n  d } | r2| | k r2| } n | rM| | k rM| } n  | rd | | | j d <| j0 d | | | f  | r| | j d <qn  | r| | | <n  | r| | | <n  t1 | | |  Wqt2 k
 rb} d d l3 } d d l4 } | j5   \ } } } | j6 d  |  x> | j7 | |  | j8 |  D] } | j6 | j9    qBWqXqWt* j: d!  xD | j$   D]6 a | t j;   s| t =| s| j	 d"  qqqWqWd S($   s!    Spider (fetch) an entire planet s    Socket timeout set to %d secondsiNs+   Timeout set to invalid value '%s', skipping(   t   Queue(   t   Threadt   targett   argss   Building work queues   Feed %s already in cacheR8   t   410s   Feed %s goneR   R9   i,  R`   R;   R:   R<   R4   R-   RF   i    RX   R,   RI   s   duplicate subscription: R1   s!   Duplicate subscription: %s and %sR0   s   Error processing %sg?s%   Finished threaded part of processing.(   NN(<   RR   RS   R   Ro   RT   t   feed_timeoutR   t   setdefaulttimeoutt   floatRb   t   timeoutsockett   setDefaultSocketTimeoutRa   R   t	   threadingR   R   R   R   Rt   R   t   intt   spider_threadsR   R   t   startt   subscriptionsRU   R   Rc   t   parseR`   Rg   Rp   R+   R   t   keyst   qsizet   FalseRr   R9   R,   R]   t   strptimeRk   R/   RW   RF   R   R   R   R   R   R   Rf   R   R   R   t   sleept   isAlive(   t   only_if_newR   R.   R   R   R   t   fetch_queuet   parse_queuet   threadst
   http_cacheR   R)   R   t   feed_sourceR   t   threadt
   feeds_seenR`   t   optionsR=   R   RI   RF   R3   R   R   R   RD   R   R   R   (    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyt   spiderPlanetn  s    			!(  			   ("   t   __doc__R]   Ru   t   reR   R(   t   xml.domR    RR   RT   Rc   Rq   R|   R   Rn   R   t   hashlibR   R   t   compileR   R   R   R   R   Ro   R   Rp   R   R+   R   R   R   R   (    (    (    s:   /home/sa3ruby/intertwingly.net/code/venus/planet/spider.pyt   <module>   s&   <T	 			I