Question
Hi, can someone
- edit the crawl function so the number of urls to crawl can be entered on the command line
- edit it so that it saves the source code of each page visited to a directory specified on the command line
"""Usage: crawler.py seed_url
seed: absolute url - the crawler will use it as the initial web address
"""
import urllib.request
import urllib.parse
import urllib.error
import urllib.robotparser
import re
import sys
# DO NOT CHANGE ok_to_crawl!!!
def ok_to_crawl(absolute_url):
    """
    Check if it is OK to crawl the specified absolute url.

    We are implementing polite crawling by checking the robots.txt file
    for all urls except the ones using the file scheme (these are urls
    on the local host and they are all OK to crawl).
    We also use this function to skip over mailto: links and javascript: links.

    Parameter:
    absolute_url (string): this is an absolute url that we would like to crawl

    Returns:
    boolean: True if the scheme is file (it is a local webpage),
             True if we successfully read the corresponding robots.txt
             file and determined that user-agent * is allowed to crawl,
             False if it is a mailto: link or a javascript: link,
             if user-agent * is not allowed to crawl it, or
             if it is NOT an absolute url.
    """
    if absolute_url.lower().startswith('mailto:'):
        return False
    if absolute_url.lower().startswith('javascript:'):
        return False
    link_obj = urllib.parse.urlparse(absolute_url)
    if link_obj.scheme.lower().startswith('file'):
        return True
    # check if the url given as input is an absolute url
    if not link_obj.scheme or not link_obj.hostname:
        print('Not a valid absolute url: ', absolute_url)
        return False
    # construct the robots.txt url from the scheme and host name
    else:
        robot_url = link_obj.scheme + '://' + link_obj.hostname + '/robots.txt'
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robot_url)
        try:
            rp.read()
        except:
            print("Error accessing robot file: ", robot_url)
            return False
        else:
            return rp.can_fetch("*", absolute_url)
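# Illustrative examples of how ok_to_crawl behaves (not part of the
# assignment code; the urls below are made up):
#   ok_to_crawl('mailto:someone@example.com')   -> False (mailto link)
#   ok_to_crawl('javascript:void(0)')           -> False (javascript link)
#   ok_to_crawl('file:///tmp/index.html')       -> True  (local file scheme)
#   ok_to_crawl('page2.html')                   -> False (not an absolute url)
#   ok_to_crawl('https://example.com/')         -> depends on that host's robots.txt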
# DO NOT CHANGE crawl!!!
def crawl(seed_url):
    """
    Start with the seed_url and crawl up to 10 urls.

    Parameter:
    seed_url (string) - this is the first url we'll visit.

    Returns:
    set of strings - set of all the urls we have visited.
    """
    urls_tocrawl = {seed_url}  # initialize our set of urls to crawl
    urls_visited = set()  # initialize our set of urls visited
    while urls_tocrawl and len(urls_visited) < 10:
        current_url = urls_tocrawl.pop()  # just get any url from the set
        if current_url not in urls_visited:  # check if we have crawled it before
            page = get_page(current_url)
            if page:
                more_urls = extract_links(current_url, page)  # get the links
                urls_tocrawl = urls_tocrawl | more_urls  # add them to be crawled
            urls_visited.add(current_url)
    return urls_visited
#------------Do not change anything above this line-----------------------------
def get_page(url):
    """
    Fetch the web page at the given url and return its html as a string.

    params: absolute url as string
    return: if there is a URLError or UnicodeDecodeError, return an empty
            string, else return the full html page content as a string
    """
    try:
        with urllib.request.urlopen(url) as url_file:
            page_string = url_file.read().decode('UTF-8')
            return page_string
    except urllib.error.URLError as url_err:
        print("Error opening url: ", url, url_err)
        return ''
    except UnicodeDecodeError as decode_err:
        print("Error decoding url", url, decode_err)
        return ''
def extract_links(base_url, page):
    """
    Extract the links contained in the page at the base_url.

    Parameters:
    base_url (string): the url we are currently crawling - web address
    page (string): the content of that url - html

    Returns:
    A set of absolute urls (set of strings) - These are all the urls extracted
    from the current url and converted to absolute urls.
    """
    urls_set = set()
    # find the value of the href attribute in each anchor tag
    page_links = re.findall(r'<a[^>]*href="([^"]*)"', page, re.IGNORECASE)
    for link in page_links:
        # Convert each link to an absolute url
        absolute_url = urllib.parse.urljoin(base_url, link)
        # only keep urls that we are allowed to crawl (polite crawling)
        if ok_to_crawl(absolute_url):
            urls_set.add(absolute_url)
    return urls_set
__author__ = 'xxx'


def main():
    # expect exactly one command line argument: the seed url
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit(1)
    seed_url = sys.argv[1]
    urls_visited = crawl(seed_url)
    # save every url we visited in crawled.txt, one per line
    with open('crawled.txt', 'w', encoding='utf-8') as new_file:
        for url in urls_visited:
            new_file.write(url + '\n')


if __name__ == '__main__':
    main()
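A minimal sketch of the two requested changes, assuming the "DO NOT CHANGE crawl" restriction can be lifted for this exercise. The parameter names (max_urls, out_dir) and the hash-based filename scheme are made up for illustration: crawl takes the url limit and an output directory from the command line, and each page fetched by get_page is written to its own file in that directory.

import os
import hashlib


def crawl(seed_url, max_urls, out_dir):
    """
    Start with the seed_url and crawl up to max_urls urls,
    saving the source of each page visited in out_dir.
    """
    os.makedirs(out_dir, exist_ok=True)  # create the directory if needed
    urls_tocrawl = {seed_url}
    urls_visited = set()
    while urls_tocrawl and len(urls_visited) < max_urls:
        current_url = urls_tocrawl.pop()
        if current_url not in urls_visited:
            page = get_page(current_url)
            if page:
                # hypothetical filename scheme: hash of the url, so any
                # url maps to a safe, unique file name
                name = hashlib.md5(current_url.encode('utf-8')).hexdigest() + '.html'
                with open(os.path.join(out_dir, name), 'w', encoding='utf-8') as out_file:
                    out_file.write(page)
                urls_tocrawl = urls_tocrawl | extract_links(current_url, page)
            urls_visited.add(current_url)
    return urls_visited


def main():
    # now expect three arguments: seed url, url count, output directory
    if len(sys.argv) != 4:
        print('Usage: crawler.py seed_url num_urls output_dir')
        sys.exit(1)
    seed_url = sys.argv[1]
    max_urls = int(sys.argv[2])  # how many urls to crawl
    out_dir = sys.argv[3]        # where to save each page's source
    urls_visited = crawl(seed_url, max_urls, out_dir)
    with open('crawled.txt', 'w', encoding='utf-8') as new_file:
        for url in urls_visited:
            new_file.write(url + '\n')

You would run it as, for example, python crawler.py https://example.com 25 pages to crawl up to 25 urls and drop each page's source into the pages directory.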