# frozen_string_literal: true

# Scrapes the high-score ranking pages under capcom.co.jp/arcade/rev and
# prints one tab-separated row per detail page to STDOUT:
#
#   <cover image file name> <difficulty code> <score> <score> ...
#
# The first two columns of each row are echoed to STDERR as a progress log.

require 'open-uri'
require 'nokogiri'

# Seconds to pause after every successful fetch, to throttle requests.
SLEEP_TIME = 3
# Seconds to wait before retrying a failed fetch.
RETRY_WAIT = 5
# Maximum number of retries per URL before the error is re-raised.
MAX_RETRIES = 5

root_url = 'http://www.capcom.co.jp/arcade/rev/PC/ranking_highscore.html'
page_url_base = root_url
detail_url_base = 'http://www.capcom.co.jp/arcade/rev/PC'

# Maps the difficulty banner image path found on a detail page to the short
# code written in the second output column.
DIFFICULTIES = {
  '../common/img_common/bnr_difficulty_easy.png' => 'esy',
  '../common/img_common/bnr_difficulty_standard.png' => 'std',
  '../common/img_common/bnr_difficulty_hard.png' => 'hrd',
  '../common/img_common/bnr_difficulty_master.png' => 'mas',
  '../common/img_common/bnr_difficulty_unlimited.png' => 'unl'
}.freeze

# Downloads +url+ and parses it as HTML, using the charset reported by the
# HTTP response.
#
# Sleeps SLEEP_TIME seconds after each successful download. When anything in
# the fetch/parse sequence raises a StandardError, waits RETRY_WAIT seconds
# and tries again, at most MAX_RETRIES times; after that the last error is
# re-raised instead of looping forever.
#
# @param url [String] absolute URL of the page to fetch
# @return [Nokogiri::HTML::Document] the parsed page
# @raise [StandardError] the last failure once MAX_RETRIES is exhausted
def parse_page(url)
  attempts = 0
  begin
    charset = nil
    # URI.open rather than Kernel#open: open-uri no longer patches
    # Kernel#open on Ruby 3.0+, where a bare open(url) looks for a local file.
    html = URI.open(url) do |io|
      charset = io.charset
      io.read
    end
    sleep SLEEP_TIME
    Nokogiri::HTML.parse(html, nil, charset)
  rescue StandardError => e
    attempts += 1
    raise if attempts > MAX_RETRIES

    STDERR.puts "retry #{attempts}/#{MAX_RETRIES} for #{url}: #{e.class}: #{e.message}"
    sleep RETRY_WAIT
    retry
  end
end

# Builds one output row from a parsed detail page.
#
# @param detail_doc [Nokogiri::HTML::Document] parsed detail page
# @return [Array] cover image file name, difficulty code (nil when the banner
#   path is not a key of DIFFICULTIES), then one Integer per
#   div.rkHiScoreDetail element
def build_row(detail_doc)
  cover_src = detail_doc.at_xpath('//div[@class="rkHiscoreMusicCv"]/img')['src']
  banner_src = detail_doc.at_xpath('//div[@class="rkHiscoreDiffBlock"]/img')['src']
  scores = detail_doc.xpath('//div[@class="rkHiScoreDetail"]').map do |record|
    record.css('p.rankScore').text.to_i
  end

  [File.basename(cover_src), DIFFICULTIES[banner_src], *scores]
end

# --begin
doc = parse_page(root_url)

# Pager hrefs are appended verbatim to the root URL; the same target can be
# linked more than once, hence uniq.
pager_links = doc.xpath('//div[@class="mPager-top"]/a[@class="pageing" or @class="pageing2"]')
page_urls = pager_links.map { |link| "#{page_url_base}#{link['href']}" }.uniq

page_urls.each do |page_url|
  page_doc = parse_page(page_url)

  page_doc.xpath('//ul[@class="rkLinkList"]/li/a').each do |link|
    # Detail hrefs start with "."; swap that leading dot for the site base.
    detail_url = link['href'].sub(/\A\./, detail_url_base)
    row = build_row(parse_page(detail_url))

    STDERR.puts row[0, 2].join("\t")
    puts row.join("\t")
  end
end