require 'open-uri'
require 'rexml/document'
require 'timeout'
require 'uri'
# Address of the OPML list that the main loop opens and reads line by line.
url = 'http://top500.feedster.com/top500.opml'

# NOTE(review): this pattern is empty here, yet the main loop reads several
# capture groups from its match -- the original expression appears to be
# missing; restore it before running.
outline = //

# Matches a <link ...> tag and captures everything between the tag name and
# the closing '>'; /m lets '.' cross line breaks.
link = /<\s*link\s+(.*?)>/m

# MIME types that qualify a <link> as a feed reference.
feedtypes = [
  'application/atom+xml',
  'application/rss+xml',
  'application/rdf+xml'
]

# Category key => heading text used for that category in the report.
cats = {
  :none      => "Do not have autodiscovery",
  :match     => "top500 references the sites indicated prefered feed",
  :found     => "top500 references a feed mentioned in autodiscovery other than the preferred one",
  :different => "top500 references a feed not mentioned in autodiscovery",
  :defunct   => "unable to fetch this page"
}

# One separate, initially empty list of entries per category.
bin = Hash[cats.keys.map { |cat| [cat, []] }]
# Reopens String to add a markup-escaping convenience method.
class String
  # Returns this string passed through REXML::Text.normalize, which swaps
  # markup-significant characters for entity references.
  def escape
    REXML::Text.normalize(self)
  end
end
# Reopens URI::HTTP to add a "www."-insensitive address comparison.
class URI::HTTP
  # Reports whether +uri+ names the same address as this URI once a leading
  # "www." host prefix is disregarded on both sides.
  #
  # uri:: the address to compare against; a String, or any object whose
  #       to_s yields the address (for example another URI).
  #
  # Returns true when both addresses are equal after the prefix is dropped,
  # false otherwise.
  def match(uri)
    # Anchored at the start so only the host prefix is touched, never a URL
    # embedded in the path or query; https gets the same treatment as http.
    leading_www = %r{\A(https?://)www\.}
    to_s.sub(leading_www, '\1') == uri.to_s.sub(leading_www, '\1')
  end
end
# Walk the OPML list line by line, fetch each listed site, collect the feeds
# its page advertises through <link rel="alternate"> tags, and file the site
# under one of the categories in `bin`.
count = 0

# Failures that are retried and, if they persist, mark a site as :defunct.
fetch_errors = [Timeout::Error, OpenURI::HTTPError, Errno::ECONNRESET,
                SocketError, Errno::ECONNREFUSED]

# Seconds to wait after each failed attempt; the trailing nil means "no more
# retries", which gives three attempts in total.
retry_pauses = [5, 60, nil]

# URI.open is used rather than Kernel#open: since Ruby 3.0 Kernel#open no
# longer fetches URLs even with open-uri loaded.  The block form closes the
# OPML handle when the walk ends, including when an exception escapes.
URI.open(url) do |opml|
  opml.each_line do |line|
    fields = outline.match(line)
    next unless fields

    # Only the captures from the second one onwards are used.
    title, description, xml_url, html_url = fields.to_a[2..-1]
    count += 1
    puts "#{count}: #{title}"
    STDOUT.flush

    # NOTE(review): both branches build the same string in the visible
    # source; presumably the non-empty case once used the description --
    # confirm against the intended output.
    if description.empty?
      entry = "#{title.escape}"
    else
      entry = "#{title.escape}"
    end
    entry += " (top500)"

    # Fetch the site's page: one attempt plus a retry after each pause.
    handle = nil
    retry_pauses.each do |pause|
      begin
        handle = URI.open(html_url)
        break
      rescue *fetch_errors
        break unless pause
        puts "retrying..."
        sleep pause
      end
    end
    unless handle
      bin[:defunct].push entry
      next # give up
    end

    # Read the whole page and release the handle straight away, so it is
    # closed even if the parsing or the sanity check below raises.
    page = begin
      handle.read
    ensure
      handle.close
    end

    feeds = []
    page.scan(link).each do |tag|
      text = tag[0]
      # Attribute values may be double- or single-quoted.
      pairs = text.scan(/(\w+)="(.*?)"/) + text.scan(/(\w+)='(.*?)'/)
      attributes = Hash[pairs]

      rel = attributes['rel']
      next unless rel && rel.downcase.split.include?("alternate")
      next unless feedtypes.include?(attributes['type'])

      href = attributes['href']
      # NOTE(review): a relative href is resolved against the feed URL
      # (xml_url), not the page URL -- confirm this is intended.
      feeds.push URI.join(xml_url, href.strip) if href
    end

    if feeds.empty?
      bin[:none].push entry
    elsif feeds.first.match(xml_url)
      bin[:match].push entry
    elsif feeds.any? { |feed| feed.match(xml_url) }
      bin[:found].push entry + " (autodiscovery)"
    else
      bin[:different].push entry + " (autodiscovery)"
    end

    # Sanity check: the number of sites counted so far must equal the total
    # number of entries across all bins.
    raise "hell" unless count == bin.values.inject(0) { |sum, entries| sum + entries.length }
  end
end
# Write the categorized results to top500.html: for each category, in a fixed
# order, its heading text, the number of entries, and the entries themselves.
# NOTE(review): several literals below are empty or a bare newline; they look
# like placeholders where markup used to be -- confirm against the intended
# output.
File.open("top500.html", "w") do |report|
  report.puts ""
  report.puts "\n"
  report.puts "Feedster top500, categorized"
  report.puts ""
  report.puts ""
  [:none, :match, :found, :different, :defunct].each do |category|
    heading = cats[category].escape
    entries = bin[category]
    report.puts "#{heading}\n"
    report.puts "(count: #{entries.length})"
    report.puts ""
    entries.each do |entry|
      report.puts "- #{entry}\n"
    end
    report.puts "\n"
  end
  report.puts ""
  report.puts ""
end