require "rexml/document"
require 'test/unit'

# Parse an atom:title, atom:summary, atom:content, or atom:rights element and
# return its contents as text; i.e., unencode that which is encoded, and
# return the rest as is.
#
# This processing model is described in sections 3.1.1 and 4.1.3 of
# the Atom Syndication Format.  Most of the bits relevant to the subject
# at hand can be found at: <http://tinyurl.com/945r5>.
#
# element:: the REXML::Element to extract text from.
#
# Returns nil when the element has a "src" attribute (out-of-line content).
# Otherwise returns a String selected by the "type" attribute:
#
# * "xhtml"                     - the serialized children of the wrapping div
# * "html"                      - the unescaped character data
# * "text", or no type          - the serialized children, as is
# * text/*, */xml, *+xml        - the serialized children, as is
# * anything else               - the character data, base64-decoded
#
# All results except the base64 one have surrounding whitespace stripped.
#
# Raises ArgumentError when type="xhtml" and the first child element is
# absent or is not named "div".

def text(element)
  return nil if element.attribute("src")

  type = element.attribute("type")
  case type && type.value

  when "xhtml"
    div = element.elements[1]
    # raise rather than throw: throw is catch/throw control flow, and
    # elements[1] is nil when the content holds no child element at all.
    raise ArgumentError, "missing xhtml:div" unless div && div.name == "div"
    # Join the nodes explicitly; Array#to_s is #inspect on Ruby 1.9 and later.
    div.to_a.map(&:to_s).join.strip

  when "html"
    # Collect every text and CDATA child, not just the first text node, so
    # empty elements yield "" and CDATA-wrapped markup is not dropped.
    element.texts.map(&:value).join.strip

  when "text", nil, /^text\//i, /\+xml$/i, /\/xml$/i
    element.to_a.map(&:to_s).join.strip

  else
    encoded = element.texts.map(&:value).join.gsub(/\s/, '')
    # The "m" directive is the base64 decoder that Base64.decode64 wraps;
    # using it directly avoids a lazy require of the base64 library.
    encoded.unpack("m").first

  end
end

# TestText holds the unit test cases for the text function described above.
#
# A single parse helper does the shared setup work.  Every other method is a
# test case whose name begins with "test_": it builds a sample feed, runs it
# through parse, and asserts on the value that comes back.

class TestText < Test::Unit::TestCase

  # Shared setup for every case:
  # * read the feed as an XML document
  # * locate the first atom:content element in it
  # * return whatever text extracts from that element

  def parse feed
    atom = {"atom" => "http://www.w3.org/2005/Atom"}
    document = REXML::Document.new(feed)
    text(REXML::XPath.first(document, "//atom:content", atom))
  end

  # content without a type attribute is handled as type="text"
  def test_default
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content>
             AT&amp;T bought by SBC!
          </content>
        </entry>
      </feed>
    END
    assert_equal("AT&amp;T bought by SBC!", parse(feed))
  end

  # content explicitly marked type="text"
  def test_text
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content type="text">
             AT&amp;T bought by SBC!
          </content>
        </entry>
      </feed>
    END
    assert_equal("AT&amp;T bought by SBC!", parse(feed))
  end

  # content marked type="html": escaped markup gets unescaped once
  def test_html
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content type="html">
             AT&amp;amp;T bought &lt;b&gt;by SBC&lt;/b&gt;!
          </content>
        </entry>
      </feed>
    END
    assert_equal("AT&amp;T bought <b>by SBC</b>!", parse(feed))
  end

  # content marked type="xhtml": the wrapping div is peeled off
  def test_xhtml
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content type="xhtml">
            <div xmlns="http://www.w3.org/1999/xhtml">
               AT&amp;T bought <b>by SBC</b>!
            </div>
          </content>
        </entry>
      </feed>
    END
    assert_equal("AT&amp;T bought <b>by SBC</b>!", parse(feed))
  end

  # content whose type is an xml mime type is passed through as markup
  def test_xml
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content type="application/rdf+xml">
            <Project xmlns="http://usefulinc.com/ns/doap#">
              <name>Codezoo Crawler</name>
            </Project>
          </content>
        </entry>
      </feed>
    END
    assert_match(/<name>Codezoo Crawler<\/name>/, parse(feed))
  end

  # content whose type is neither xml nor text is base64-decoded
  def test_base64
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content type="encoded/base64">
            U2VuZCByZWluZm9yY2VtZW50cw==
          </content>
        </entry>
      </feed>
    END
    assert_equal("Send reinforcements", parse(feed))
  end

  # out-of-line content (src attribute present) yields nil
  def test_src
    feed = <<-END
      <feed xmlns="http://www.w3.org/2005/Atom">
        <entry>
          <content src="http://example.com/doc.pdf" type="application/pdf"/>
        </entry>
      </feed>
    END
    assert_equal(nil, parse(feed))
  end
end
