#!/usr/bin/env ruby
#
# Downloads the files linked from the page at the given URL whose
# links match at least one of the given regexs.
#
# Example: download.rb "http://some.com/thing.html" "*.rb"
#
# Only plain HTTP on port 80 is spoken; the protocol part of the URL
# is discarded. Files are written below the current directory, using
# the path of the link as the relative file name.
#

require 'net/http'
require 'fileutils'

# ==================================================
# Functions
# ==================================================

# String -> nil
# Writes a diagnostic message to STDERR.
def log(msg)
  STDERR.puts "*** " + msg
end

# Writes the command line synopsis to STDERR.
def print_usage()
  STDERR.puts "Usage: download <url> <regex>+"
end

# String, Regexp[], Hash[String -> true] -> nil
# Records str as a key of array (a Hash used as a set) if it matches
# a RE in res and has not been recorded already.
def maybe_add(str, res, array)
  return if array[str]
  array[str] = true if res.any? { |re| re.match str }
  nil
end

# String, String -> String or nil
# GETs page from host over HTTP on port 80. Returns the body of a
# 200 response; any other outcome is logged and yields nil.
def fetch_page(host, page)
  # Net::HTTP#get returns only the response object; the body has to
  # be read from it.
  resp = Net::HTTP.new(host, 80).get(page)
  unless resp.is_a?(Net::HTTPOK)
    log "ERROR: #{resp.code} #{resp.message}"
    return nil
  end
  resp.body.to_s
rescue SocketError, SystemCallError, Timeout::Error, IOError,
       Net::ProtocolError, ArgumentError => e
  log "ERROR: #{e.message}"
  nil
end

# String[] -> Regexp[]
# Turns the command line patterns into case-insensitive Regexps.
# Raises RegexpError if a pattern is not a valid regular expression.
def compile_regexs(regex_strings)
  regex_strings.map do |regex_string|
    # make sure e.g. "*.java" -> ".*.java"
    # this isn't complete, but should do
    # (lookbehind, so an existing ".*" / ".+" is left alone)
    pattern = regex_string.gsub(/(?<!\.)\*/, ".*")
    pattern = pattern.gsub(/(?<!\.)\+/, ".+")
    Regexp.new(pattern, Regexp::IGNORECASE)
  end
end

# ==================================================
# Main
# ==================================================

if ARGV.length == 0
  STDERR.puts "URL required"
  print_usage
  exit 1
end

if ARGV.length == 1
  STDERR.puts "At least one regex required"
  print_usage
  exit 1
end

# Get the main URL and remove the protocol
url = ARGV.shift
url = url.sub(/\A[^:\/]+:\/\//, "")

# the remaining arguments are the regexs
begin
  regexs = compile_regexs(ARGV)
rescue RegexpError => e
  STDERR.puts "Invalid regex: #{e.message}"
  print_usage
  exit 1
end

# split the url into a host and page; a URL without a path refers
# to the root page
islash = url.index("/")
if islash
  host = url[0...islash]
  page = url[islash..-1]
else
  host = url
  page = "/"
end

if host.empty?
  STDERR.puts "URL has no host"
  print_usage
  exit 1
end

log "HOST: " + host
log "PAGE: " + page

data = fetch_page(host, page)
exit 1 unless data

# String[link] -> True
links = {}

# -----------------------------------------------------
# Good interview question:
#   1. Write a regular expression to capture the link
#      in a valid 'href' tag
#   2. Write a regular expression to capture the link
#      in a 'href' tag where attributes are surrounded
#      by actual characters
#   3. Write a regular expression to capture the link
#      in a 'href' tag where attributes are surrounded
#      by characters from an infinite set of characters
# -----------------------------------------------------

# href="asdf"
data.scan(/<[^>]+href="([^"]+)"[^>]*>/mi) do |res|
  maybe_add res[0], regexs, links
end

# href='asdf'
data.scan(/<[^>]+href='([^']+)'[^>]*>/mi) do |res|
  maybe_add res[0], regexs, links
end

# href=asdf
data.scan(/<[^>]+href=([^\s"'>]+)[^>]*>/mi) do |res|
  maybe_add res[0], regexs, links
end

# directory of the main page: drop any query/fragment, then the
# last path component
relative_dir = page.sub(/[?#].*\z/m, "").sub(/\/[^\/]*\z/, "")

# everything is written below the current directory
base_dir = File.join(Dir.pwd, "")

# Different links could have the same canonical forms
seen_pages = {}

links.keys.each do |link|

  # create the page name
  if link.start_with?("//") || link =~ /\A[a-z][a-z0-9+.\-]*:/i
    # a link carrying its own protocol or host cannot be fetched
    # from the host we are talking to
    log "Skipping " + link
    next
  elsif link.start_with?("/")
    target = link
  else
    target = relative_dir + "/" + link
  end

  # the fragment is never sent to the server
  target = target.sub(/#.*\z/m, "")

  # make sure we haven't already done this one
  next if seen_pages[target]
  seen_pages[target] = true

  outfile_name = target.sub(/\A\/+/, "")
  if outfile_name.empty? || outfile_name.end_with?("/")
    log "Skipping " + target + " (no file name)"
    next
  end

  # links come from the remote page: refuse anything (e.g. "../..")
  # that would be written outside the current directory
  unless File.expand_path(outfile_name).start_with?(base_dir)
    log "Skipping " + target + " (outside the current directory)"
    next
  end

  # download the file
  log "Downloading " + target + " -> " + outfile_name + "..."
  body = fetch_page(host, target)
  next unless body

  begin
    # make sure we make the (nested) directories
    FileUtils.mkdir_p File.dirname(outfile_name)

    # write to the outfile; binary mode keeps the bytes untouched
    File.open(outfile_name, "wb") do |outfile|
      outfile.print body
    end
  rescue SystemCallError => e
    log "ERROR: #{e.message}"
  end
end