ó
ìŠ'Gc           @   sj   d  Z  d d g Z d Z d d l Z d d l m Z d d d „  ƒ  YZ d	 „  Z e d
 k rf e ƒ  n  d S(   s8  
    This class is based on the Python recipe titled
    "Language detection using character trigrams"
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/326576
    by Douglas Bagnall.
    It has been (slightly) adapted by Eric van der Vlist to support
    Unicode and accept a method to parse strings.
s   Douglas Bagnalls%   Eric van der Vlist <vdv@dyomedea.com>t   PythoniÿÿÿÿN(   t   urlopent   Trigramc           B   sn   e  Z d  Z d Z d d „ Z d d „ Z d „  Z d d „ Z d „  Z	 d	 „  Z
 d
 „  Z d „  Z d „  Z RS(   s%  
    From one or more text files, the frequency of three character
    sequences is calculated.  When treated as a vector, this information
    can be compared to other trigrams, and the difference between them
    seen as an angle.  The cosine of this angle varies between 1 for
    complete similarity, and 0 for utter difference.  Since letter
    combinations are characteristic to a language, this can be used to
    determine the language of a body of text. For example:

        >>> reference_en = Trigram('/path/to/reference/text/english')
        >>> reference_de = Trigram('/path/to/reference/text/german')
        >>> unknown = Trigram('url://pointing/to/unknown/text')
        >>> unknown.similarity(reference_de)
        0.4
        >>> unknown.similarity(reference_en)
        0.95

    would indicate the unknown text is almost cetrtainly English.  As
    syntax sugar, the minus sign is overloaded to return the difference
    between texts, so the above objects would give you:

        >>> unknown - reference_de
        0.6
        >>> reference_en - unknown    # order doesn't matter.
        0.05

    As it stands, the Trigram ignores character set information, which
    means you can only accurately compare within a single encoding
    (iso-8859-1 in the examples).  A more complete implementation might
    convert to unicode first.

    As an extra bonus, there is a method to make up nonsense words in the
    style of the Trigram's text.

        >>> reference_en.makeWords(30)
        My withillonquiver and ald, by now wittlectionsurper, may sequia,
        tory, I ad my notter. Marriusbabilly She lady for rachalle spen
        hat knong al elf

    Beware when using urls: HTML won't be parsed out.

    Most methods chatter away to standard output, to let you know they're
    still there.
    i    c         C   s)   i  |  _  | d  k	 r% |  j | ƒ n  d  S(   N(   t   lutt   Nonet	   parseFile(   t   selft   fn(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt   __init__A   s    	s     c         C   sR   xK | D]C } |  j  j | i  ƒ } | j | d ƒ d | | <| d | } q W| S(   Ni    i   (   R   t
   setdefaultt   get(   R   t   linet   pairt   lettert   d(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt   _parseAFragmentF   s
    c         C   s   |  j  | ƒ |  j ƒ  d  S(   N(   R   t   measure(   R   t   string(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt   parseStringM   s    s
   iso-8859-1c         C   s„   d } d | k r! t  | ƒ } n t | ƒ } x< t | ƒ D]. \ } } |  j | j ƒ  j | ƒ d ƒ } q: W| j ƒ  |  j ƒ  d  S(   Ns     s   ://t    (   R   t   opent	   enumerateR   t   stript   decodet   closeR   (   R   R   t   encodingR   t   ft   zR   (    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyR   Q   s    &
c         C   s^   d } xD |  j  j ƒ  D]3 } | t g  | j ƒ  D] } | | ^ q/ ƒ 7} q W| d |  _ d S(   sX   calculates the scalar length of the trigram vector and
        stores it in self.length.i    g      à?N(   R   t   valuest   sumt   length(   R   t   totalt   yt   x(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyR   a   s    1c   	      C   s¾   t  | t ƒ s t d ƒ ‚ n  |  j } | j } d } xm | j ƒ  D]_ } | | k rC | | } | | } x6 | D]+ } | | k rp | | | | | 7} qp qp WqC qC Wt | ƒ |  j | j S(   s—   returns a number between 0 and 1 indicating similarity.
        1 means an identical ratio of trigrams;
        0 means no trigrams in common.
        s&   can't compare Trigram with non-Trigrami    (   t
   isinstanceR   t	   TypeErrorR   t   keyst   floatR   (	   R   t   othert   lut1t   lut2R   t   kt   at   bR!   (    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt
   similarityi   s    		

$c         C   s   d |  j  | ƒ S(   sc   indicates difference between trigram sets; 1 is entirely
        different, 0 is entirely the same.i   (   R,   (   R   R&   (    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt   __sub__}   s    c         C   si   g  } d } xM | r[ |  j  | ƒ } | j | ƒ | d | } | d k r | d 8} q q Wd j | ƒ S(   s:   returns a string of made-up words based on the known text.s     i   s    	t    (   t   likelyt   appendt   join(   R   t   countt   textR)   t   n(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt	   makeWordsƒ   s    	c         C   sj   | |  j  k r d Sg  } x2 |  j  | j ƒ  D] \ } } | j | | ƒ q- Wd j | ƒ } t j | ƒ S(   ss   Returns a character likely to follow the given string
        two character string, or a space if nothing is found.R   R.   (   R   t   itemsR0   R1   t   randomt   choice(   R   R)   t   letterst   v(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyR/      s     N(   t   __name__t
   __module__t   __doc__R   R   R   R   R   R   R   R,   R-   R5   R/   (    (    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyR      s   ,					c          C   s9  t  d ƒ }  t  d ƒ } t  d ƒ } t  d ƒ } t  d ƒ } t  d ƒ } t  d ƒ } t  d ƒ } d	 GHd
 |  | GHd | |  GHd |  | GHd |  | GHd | | GHd | | GHd | | GHd | | GHd | |  GHd | | GHd | | GHd |  | GHd | | GHd | | GHd |  | GHd | | GHd  S(   Ns-   http://gutenberg.net/dirs/etext97/lsusn11.txts-   http://gutenberg.net/dirs/etext03/candi10.txts3   http://gutenberg.net/dirs/1/0/4/9/10492/10492-8.txts3   http://gutenberg.net/dirs/1/2/8/4/12844/12844-8.txts3   http://gutenberg.net/dirs/1/0/1/1/10117/10117-8.txts3   http://gutenberg.net/dirs/1/3/0/4/13041/13041-8.txts-   http://gutenberg.net/dirs/etext05/cfgsh10.txts3   http://gutenberg.net/dirs/1/3/7/0/13704/13704-8.txts   calculating difference:s   en - fr is %ss   fr - en is %ss   en - en2 is %ss   en - fr2 is %ss   fr - en2 is %ss   fr - fr2 is %ss   fr2 - en2 is %ss   fi - fr  is %ss   fi - en  is %ss   fi - se  is %ss   no - se  is %ss   en - no  is %ss   no - no2  is %ss   se - no2  is %ss   en - no2  is %ss   fr - no2  is %s(   R   (   t   ent   frt   fit   not   set   no2t   en2t   fr2(    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt   test   s2    t   __main__(    (	   R=   t   __authors__t   __license__R7   t   urllibR   R   RF   R;   (    (    (    sX   /home/sa3ruby/intertwingly.net/code/venus-bzr/examples/filters/guess-language/trigram.pyt   <module>
   s   Œ	