require 'open-uri'
require 'rexml/document'
require 'timeout'
require 'uri'

# Location of the Feedster top500 OPML listing that drives the run.
url = 'http://top500.feedster.com/top500.opml'

# Matches one self-closing <outline .../> element and captures, in order,
# its type, title, description, xmlUrl and htmlUrl attribute values.
outline = /<outline type="(.*?)" title="(.*?)" description="(.*?)" xmlUrl="(.*?)" htmlUrl="(.*?)" \/>/

# Matches a <link ...> tag (it may span several lines) and captures the raw
# attribute text between the tag name and the closing '>'.
link = /<\s*link\s+(.*?)>/m

# MIME types that identify a <link> as pointing at a feed.
feedtypes = [
  'application/atom+xml',
  'application/rss+xml',
  'application/rdf+xml'
]

# Category key => human-readable description of that category.
cats = {
  :none      => "Do not have autodiscovery",
  :match     => "top500 references the sites indicated prefered feed",
  :found     => "top500 references a feed mentioned in autodiscovery other than the preferred one",
  :different => "top500 references a feed not mentioned in autodiscovery",
  :defunct   => "unable to fetch this page"
}

# One (initially empty) list of entries per category.
bin = {}
cats.each_key { |cat| bin[cat] = [] }

# Reopens String to add a helper for embedding text in generated markup.
class String
  # Returns the receiver passed through REXML::Text.normalize, i.e. with
  # characters such as '&', '<' and '>' replaced by entity references.
  def escape
    REXML::Text.normalize(self)
  end
end

# Reopens URI::HTTP (and therefore URI::HTTPS, its subclass) to add a loose
# URL comparison.
class URI::HTTP
  # Reports whether this URI and +uri+ denote the same address once a leading
  # "www." host prefix is ignored.
  #
  # uri:: the address to compare against; a String or any object whose
  #       +to_s+ yields the URL (for example another URI).
  #
  # Returns true when both addresses are equal after normalisation, false
  # otherwise. The scheme is still significant: http and https never match.
  def match(uri)
    # Anchor at the start so only the host prefix is stripped, never an
    # "http://www." that happens to appear later (e.g. inside a query string).
    canonical = lambda { |address| address.to_s.sub(%r{\A(https?://)www\.}, '\1') }
    canonical.call(self) == canonical.call(uri)
  end
end

# Errors that count as a failed attempt to fetch a page. A fetch is retried a
# bounded number of times before the site is filed under :defunct.
FETCH_ERRORS = [Timeout::Error, OpenURI::HTTPError, Errno::ECONNRESET,
                SocketError, Errno::ECONNREFUSED].freeze

# Opens +target+ (a URL string) through open-uri and returns the IO-like handle.
# Newer Rubies expose this as URI.open and no longer let Kernel#open fetch
# URLs; older ones only have the Kernel#open form, so use whichever exists.
def fetch_page(target)
  URI.respond_to?(:open) ? URI.open(target) : open(target)
end

# Fetches +target+, retrying after each delay in +pauses+ (seconds) when one of
# FETCH_ERRORS is raised. Returns the open handle, or nil once every attempt
# has failed. Any other exception propagates to the caller.
def fetch_with_retries(target, pauses = [5, 60])
  attempts = 0
  begin
    fetch_page(target)
  rescue *FETCH_ERRORS
    return nil if attempts >= pauses.length
    puts "retrying..."
    sleep pauses[attempts]
    attempts += 1
    retry
  end
end

# Number of <outline> entries processed so far.
count = 0

# Walk the OPML listing line by line; for every site fetch its home page, look
# for feed autodiscovery <link> tags and file the site under one category.
listing = fetch_page(url)
begin
  listing.each_line do |line|
    fields = outline.match(line)
    next unless fields
    # Skip the full match and the "type" capture.
    title, description, xmlUrl, htmlUrl = fields.to_a[2..-1]
    count += 1

    puts "#{count}: #{title}"
    STDOUT.flush

    # HTML fragment describing this site in the report.
    entry = "<!-- #{count} --><a href='#{htmlUrl.escape}'"
    entry += " alt='#{description.escape}'" unless description.empty?
    entry += ">#{title.escape}</a>"
    entry += " (<a href='#{xmlUrl.escape}'>top500</a>)"

    handle = fetch_with_retries(htmlUrl)
    unless handle
      bin[:defunct].push entry
      next # give up
    end

    # Read the whole page, then release the handle no matter what happens.
    begin
      page = handle.read
    ensure
      handle.close
    end

    # Feed URLs advertised by the page, in document order.
    feeds = []

    page.scan(link).each do |tag|
      pairs = tag[0].scan(/(\w+)="(.*?)"/) + tag[0].scan(/(\w+)='(.*?)'/)
      tag_attrs = Hash[*pairs.flatten]
      rel, type, href = tag_attrs['rel'], tag_attrs['type'], tag_attrs['href']

      next unless rel && rel.downcase.split.include?("alternate")
      next unless type && feedtypes.include?(type)
      next unless href

      begin
        # A relative href is relative to the page it was found on.
        candidate = URI.join(htmlUrl, href.strip)
      rescue URI::Error
        next # one malformed href must not abort the whole run
      end

      # Only URIs that can be compared with #match (HTTP/HTTPS) are usable below.
      feeds.push candidate if candidate.respond_to?(:match)
    end

    if feeds.empty?
      bin[:none].push entry
    elsif feeds[0].match(xmlUrl)
      bin[:match].push entry
    elsif feeds.find { |feed| feed.match(xmlUrl) }
      bin[:found].push entry + " (<a href='#{feeds[0].to_s.escape}'>autodiscovery</a>)"
    else
      bin[:different].push entry + " (<a href='#{feeds[0].to_s.escape}'>autodiscovery</a>)"
    end

    # Sanity check: every processed entry must be in exactly one bin.
    raise "hell" unless count == bin.values.inject(0) { |sum, list| sum + list.length }
  end
ensure
  listing.close
end

# Write the categorised results to top500.html: one section per category,
# each with its description, an entry count and a bulleted list of entries.
File.open("top500.html", "w") do |output|
  output.puts "<html xmlns='http://www.w3.org/1999/xhtml'>",
              "<head>",
              "<title>Feedster top500, categorized</title>",
              "</head>",
              "<body>"

  # Fixed order so the sections always appear in the same sequence.
  [:none, :match, :found, :different, :defunct].each do |name|
    items = bin[name]

    output.puts "<h2 id='#{name}'>#{cats[name].escape}</h2>"
    output.puts "(count: #{items.length})"
    output.puts "<ul>"
    # Entries are already HTML fragments, so they are written verbatim.
    items.each { |entry| output.puts "<li>#{entry}</li>" }
    output.puts "</ul>"
  end

  output.puts "</body>", "</html>"
end
