
Jc           @   sf  d  Z  d d l Z d d l Z d d l Z d d l Z d d l Z d d l m Z d d l Z d d l	 Z	 d d l
 Z
 d d l Z d d l Z d d l Z d d l Z d d l m Z y d d l m Z Wn d d l m Z n Xe j d  Z e j d  Z e j d	  Z e j d
  Z e a d   Z d d  Z d   Z d   Z d   Z e  d  Z! d S(   s   
Fetch either a single feed, or a set of feeds, normalize to Atom and XHTML,
and write each as a set of entries in a cache directory.
iN(   t   minidom(   t   StringIO(   t   md5(   t   news   ^\w+:/*(\w+:|www\.)?s   [?/:|]+s   ^[,.]*s   [,.]*$c         C   ss  yO t  j |  rN t | t  r< | j d  j d  } qN | j d  } n  Wn n Xt | t  rz | j d  } n  t  j d |  } t j d |  } t	 j d |  } t
 j d |  } t |  d k r`| j d  } xz t t |  d d  D]] } t d j | |    d k  r d j | |   d t d j | |   j   } Pq q Wn  t j j |  |  S(	   s   Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    s   utf-8t   idnat    t   ,i   i    ii   (   t   re_url_schemet   matcht
   isinstancet   strt   decodet   encodet   unicodet   subt   re_slasht   re_initial_cruftt   re_final_cruftt   lent   splitt   ranget   joinR   t	   hexdigestt   ost   path(   t	   directoryt   filenamet   partst   i(    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyR      s*     c         C   sI   t  | d  } | j |   | j   | rE t j | | | f  n  d S(   s     write the document out to disk t   wN(   t   opent   writet   closeR   t   utime(   t   xdoct   outt   mtimet   file(    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyR   :   s
    
 c         C   s   t  j  |   } | d d k S(   Ni    t   httpt   https(   s   https   https(   t   urlparse(   t   urit   parsed(    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyt   _is_http_uriA   s    c         C   s  t  j } t j   } | j d  s | j d  rT t | j  d k rT d | _ q | j r | j	 j
 j j   d k r d | _ q d | _ n  t j t j   d t j |    } | j d k rc| j d	  rc| j | j d
 <| j d  r(t | j  d k r(| j d |   d | j d <qE|  | j k rJ| j d |   qE| j d |  | j  n| j d k r| j d  rt | j  d k r| j d |  | j  | j | j d
 <n| j d k r| j d	  r| j | j d
 <|  | j k r| j d |   n | j d |  | j  | j j d  st| j j d  r| j j } t j |  | k rqd  SqqE| j j j d  rd  S| j j j d  rE| j j j d  rE| j d =qEn| | j d k r| j d |   nZ | j d k r| j d |   n8 | j d k r5| j d | j |   n | j d |   | j r| j r| j | _ | j j d d  d k | _ | j j d  | _ n  t | j  | j d  <| j d!  r| j d"  r| j r| j | j d# <n6 | j j d"  r | j d" r | j d" | j d# <n  | j j d$  rI| j d$ | j d% <n4 | j d&  r}| j r}t j  | j  | j d% <n  | j j d'  r| j d' | j d( <qn  | j rx| j j d)  st!   | j d) <n  d* } | j j d+  rd, } n  | j dM k rd/ } n  xe | j j" D]# } | j# d0 k r| | d1 <PqqW| j j" j$ t j% i d0 d2 6| d1 6|  d3 6  n  x4 t j& |   j'   D] \ }	 }
 |
 | j d4 |	 <qWt( j( |  |  d5 d6 l  m) } t* d  k r| j,   a* n  i  } x | j D] } | j d7  s| j- rCt. j- d  |  | d7 <| d7 sCqqCn  d8 } | j d9  rd| j/ } n  | j d:  r| j0 } n  | | j | j- dN  d k r| | f | | j- <qqWt j1   } xB| j2   D]4\ } } t3 | | j-  } d  } | j d;  s| d; r)| j d< d   | d; <n  | j d;  r[y t4 j5 | j6  } Wq[q[Xn  | sy t7 j8 |  j9 } Wq| j j d;  ry t4 j5 | j j6  } WqqXqqXn  | st j   } n  t j |  | d; <t. j. | |  } | j:   j; d=  } | j<   x9 t j= |   D]( } t> j? | | d> d? } | s%Pq%q%W| st7 j@ jA |  rt7 jB |  qqn  tC | | |  t* d  k r| j j d7 | j j d@ d    } | r	tD |  tE k r| j; d=  } n  | t* t3 d8 | j-  <q	qqWt* r	t* jF   n  t j |   r 
g  | j D] } | j d;  r5	| j6 ^ q5	} | jG   | r	t jH dA | d5  | j d <n- | j j d  r	t j | j j  g } n  | s	| d5 | k  r 
dB t j |   } | j |  | | j d <q 
n  | j dC k rZ
| j j d  r.
| j d =n  | j j d  r| j d | j d <qn | j dD k ry
dE | j d <n | j dF k r
dG | j d <n | j d k r
dH | j d <nd | j d k r
dI | j d <nE | j d k r
dJ | j d <n& | j d k rdK | j | j d <n  t7 j@ jA |  s=t7 jI |  n  tJ jK dL t  jL  } t. jM | jN | j | j | j  tC | j:   j; d=  t3 | |    | j<   d  S(O   Nt   statust   entriesi    i   t   timeouti  i  iQ t   urlt   planet_http_locations
   No data %ss   no datat   planet_messages   Updating feed %ss   Updating feed %s @ %si-  s    Feed has moved from <%s> to <%s>i0  s   Feed %s unchangeds   Feed %s unchanged @ %st   planet_updateds   no activity int	   duplicatei  s   Feed %s gones   Feed %s timed outi  s   Error %d while updating feed %st   planet_bozot   truet   planet_formatt   planet_http_statust   headerst   etagt   planet_http_etags   last-modifiedt   planet_http_last_modifiedt   modifieds   -content-hasht   planet_content_hasht   linkss   application/atom+xmlt   rsss   application/rss+xmlt   rss090t   rss10s   application/rdf+xmlt   selft   typet   relt   hreft   planet_i(   t   idindext   idR   t	   publishedt   updatedt   updated_parsedt   published_parseds   utf-8t   modet   filtert   links   %Y-%m-%dT%H:%M:%SZs   no activity in %d daysi   i  s   403: forbiddeni  s   404: not founds   408: request timeouts	   410: gones   internal server errors   http status %ssD   <feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>
(   s   rss090s   rss10(   R   (O   t   planett   loggert   configt   cache_sources_directoryt   has_keyR   R-   R,   t   bozot   bozo_exceptiont	   __class__t   __name__t   lowert   timet   gmtimet   activity_thresholdR/   t   feedt   warningt   infoR2   t
   feedparsert   _parse_date_iso8601R1   t
   startswitht   errort   versiont   getR
   R9   R8   R<   t   asctimet   listR>   RD   t   appendt   FeedParserDictt   feed_optionst   itemst   scrubRG   t   indext   NoneR   RH   t   reconstituteRI   RJ   t   cache_directoryt   valuesR   t   calendart   timegmRK   R   t   statt   st_mtimet   toxmlR   t   unlinkt   filterst   shellt   runR   t   existst   removeR   RC   R   R    t   sortt   strftimet   makedirsR    t   parseStringt   xmlnst   sourcet   documentElement(   t   feed_urit	   feed_infot   datat   logt   sourcest   activity_horizonRJ   t   feedtypeRO   t   namet   valueRG   t   idst   entryt   cachet
   cache_fileR$   R"   t   outputRN   t   feedidt   msg(    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyt
   writeCacheE   sV   	$	&$3
	  	 	
"" 
 	   
  $   
   "%c      	   C   s  d d  l  } d d l m } | j t j    } | j d t  \ } } x| r| j d | |   t	 d  }	 t
 |	 d |  t
 |	 d t j i d d	 6  yy_ t | t  r | j d
  }
 n | j d  j d
  }
 |
 | k r| j d | |
  n  Wn | j d |  | }
 n Xi  } | j j d  rO| j d | d <n  | j j d  ru| j d | d <n  | j |
 d d | \ } } t | pd  j   | d <| j d k r| j rd | _ q| j j d  r| j d | d k rd | _ qn  t	 |  }	 t
 |	 d | j d |   | j d  rL| d =n  t
 |	 d |  WnD| k
 r| j d | |   n!| j k
 r} | j d t |  |   n t j k
 r} | j j j   d k rd |	 j d	 <| j  d |   q| j d t |  |   n t! k
 r} d d  l" } d d  l# } | j$   \ } } } | j d |  x> | j% | |  | j& |  D] } | j | j'    qWn X| j( d t d  | | |	 f  | j d t  \ } } qL Wd  S(!   Ni(   t   BadStatusLinet   blocks   Fetching %s via %dR   R/   R8   t   500R,   R   s   utf-8s   IRI %s mapped to %ss   unable to map %s to a URIR:   s   If-None-MatchR;   s   If-Modified-Sincet   GETs   -content-hashi   i0  R=   s   content-locations   content-encodings&   Bad Status Line received for %s via %ds   HttpLib2Error: %s via %dR.   t   408s   Timeout in thread-%ds   HTTP Error: %s in thread-%ds   Error processing %st   item()   t   httplib2t   httplibR   t   HttpRR   t   http_cache_directoryRe   t   TrueR_   R   t   setattrR`   Ri   R	   R   R   R   R]   RT   t   requestR   R   R,   t	   fromcacheRc   t   HttpLib2ErrorR
   t   socketRW   RX   RY   R8   t   warnt	   Exceptiont   syst	   tracebackt   exc_infot   format_exception_onlyt	   format_tbt   rstript   put(   t   thread_indext   input_queuet   output_queueR   R   R   t   hR)   R   R]   R   R8   t   respt   contentt   eR   R   RC   R   t   tbt   line(    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyt
   httpThread  st    		 
	

	c         C   s  t  j } t a t j   } y' t j t |   | j	 d |  WnT y3 d d l
 } | j t |   | j	 d |  Wq | j d |  q Xn Xd d l m } d d l m } |   } |   } i  } t j   }	 |	 rt j j |	  rt j |	  n  t t j    rqxc t t t j     D]9 }
 | d t d |
 | | | f  | |
 <| |
 j   q1Wn | j	 d	  x t j   D] } t j   } t | |  } t j |  } | j r|  r| j	 d
 |  qn  | j j  d d  d k r| j	 d |  qn  | r<t" |  r<| j# d | | f  q| j# d | | | f  qWx$ | j$   D] } | j# d d#  qfWi  } xU| j%   s| j%   s| rx) | j%   d k r| rt& j' d  qWx| j%   r| j  t(  \ } } } yt) | d  s%t | j* j+  d k  ri  } t) | d  r| j j  d d  | d <y" t& j, | j j  d d   } WqqXn  t j | |  } nR t j- i d d 6| j* d 6g  d 6i  d 6| j. d 6d d 6t | j* j+  d 6 } | j j  d d  } | s| j j  d d  } n  | } | j/ d  r=| j0 } n  d } | r^| | k r^| } n | ry| | k ry| } n  | rd | | | j d <| j1 d | | | f  | r| | j d  <qn  | r| | | <n  | r| | | <n  t2 | | |  Wqt3 k
 r} d d l4 } d d l5 } | j6   \ } } } | j7 d! |  x> | j8 | |  | j9 |  D] } | j7 | j:    qnWqXqWxD | j$   D]6 a | t j;   s| t =| s| j	 d"  qqqWqWd S($   s!    Spider (fetch) an entire planet s    Socket timeout set to %d secondsiNs+   Timeout set to invalid value '%s', skipping(   t   Queue(   t   Threadt   targett   argss   Building work queues   Feed %s already in cacheR7   t   410s   Feed %s goneR   i    g?R8   i,  R]   R:   R9   R;   Rd   R-   RE   RU   R,   RH   s   duplicate subscription: R1   s!   Duplicate subscription: %s and %sR0   s   Error processing %ss%   Finished threaded part of processing.(   NN(<   RP   RQ   R   Rm   RR   t   feed_timeoutR   t   setdefaulttimeoutt   floatR_   t   timeoutsockett   setDefaultSocketTimeoutR^   R   t	   threadingR   R   R   R   R{   R   t   intt   spider_threadsR   R   t   startt   subscriptionsRS   R   R`   t   parseR]   Re   Rn   R+   R   t   keyst   qsizeRZ   t   sleept   Falset   hasattrR8   R,   t   strptimeRi   R/   RT   RE   R   R   R   R   R   R   Rc   R   R   R   t   isAlive(   t   only_if_newR   R.   R   R   R   t   fetch_queuet   parse_queuet   threadst
   http_cacheR   R)   R   t   feed_sourceR   t   threadt
   feeds_seenR]   t   optionsR<   R   RH   RE   R3   R   R   R   RC   R   R   R   (    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyt   spiderPlanetd  s    			!(  			   ("   t   __doc__RZ   Rr   t   reR   R(   t   xml.domR    RP   RR   R`   Ro   Ry   R   Rl   R   t   hashlibR   R   t   compileR   R   R   R   R   Rm   R   Rn   R   R+   R   R   R   R   (    (    (    s>   /home/sa3ruby/intertwingly.net/code/venus-bzr/planet/spider.pyt   <module>   s&   <T	 			I