Question
Lab: Create a topical crawler in R Due: Submit your R code along with a screenshot of your results. Requirements: Follow the basic crawling algorithm
Lab: Create a topical crawler in R
Due: Submit your R code along with a screenshot of your results.
Requirements:
Follow the basic crawling algorithm provided in the slides
Crawl 50 pages for your repository
Only store websites which contain at least one term in the body text from a list of keywords chosen by your group
Store the following information in a character vector:
Error checking requirements:
If the link in the frontier is a link to a jpg, go to the next item in the frontier
If the retrieved page is less than 10 characters, go to the next item in the frontier
Check for relative/absolute paths when adding to the frontier
You may come across other implementation challenges during testing
Hints:
Packages that will be useful: RCurl, XML, stringr, httr
getURL call:
doc <- tryCatch(getURL(exploredlink),error=function(cond){return("")})
get the title:
# Pull the text of every <title> node out of the parsed document
titleText <- xmlToDataFrame(nodes = getNodeSet(doc, "//title"))
# Flatten the one-column data frame down to a plain character vector
titleText <- as.vector(titleText$text)
# Drop duplicate title strings
titleText <- unique(titleText)
Retrieves the body text from a page:
bodyText<-tryCatch(htmlToText(content(GET(exploredlink),type="text/html",as="text")),error=function(cond){return("")})
Parses words into a vector:
bodyText<-str_split(tolower(str_replace_all((str_replace_all(bodyText,"(\\t|\ |\ )"," ")),"\\s{2,}"," "))," ")[[1]]
Parsing links from a page:
# Grab every <a> element from the parsed document
anchor <- getNodeSet(doc, "//a")
# Extract each anchor's href attribute (NULL when the attribute is absent)
anchor <- sapply(anchor, function(x) xmlGetAttr(x, "href"))
any() operator will check for true values in a logical vector
x %in% y will check for x membership in y
Given code format :
#Write a topical crawler using the information provided below:
##Start your code with these libraries: library(RCurl) library(XML) library(stringr) library(httr)
htmlToText <- function(input, ...) { ###---PACKAGES ---### require(RCurl) require(XML) ###--- LOCAL FUNCTIONS ---### # Determine how to grab html for a single input element evaluate_input <- function(input) { # if input is a .html file if(file.exists(input)) { char.vec <- readLines(input, warn = FALSE) return(paste(char.vec, collapse = "")) } # if input is html text if(grepl("", input, fixed = TRUE)) return(input) # if input is a URL, probably should use a regex here instead? if(!grepl(" ", input)) { # downolad SSL certificate in case of https problem if(!file.exists("cacert.perm")) download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.perm") return(getURL(input, followlocation = TRUE, cainfo = "cacert.perm")) } # return NULL if none of the conditions above apply return(NULL) } # convert HTML to plain text convert_html_to_text <- function(html) { doc <- htmlParse(html, asText = TRUE) text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue) return(text) } # format text vector into one character string collapse_text <- function(txt) { return(paste(txt, collapse = " ")) } ###--- MAIN ---### # STEP 1: Evaluate input html.list <- lapply(input, evaluate_input) # STEP 2: Extract text from HTML text.list <- lapply(html.list, convert_html_to_text) # STEP 3: Return text text.vector <- sapply(text.list, collapse_text) return(text.vector) }
###Run the function code for htmlToText()(Be sure this function is listed in your Environment)
###Load the first element in the frontier to an "exploredlink" variable
# Seed URLs the crawl starts from
frontier <- c("http://www.cnn.com", "http://www.kdnuggets.com", "http://news.google.com")
# Topic keywords: a page is stored only if its body text contains one of these
topicwords <- c("technology", "school", "web", "mining", "news")

num <- 50      # total number of items to crawl
result <- c()  # repository of stored page records
j <- 0         # number of items in the repository

while (j < num) {
  # Stop when there is nothing left to explore
  if (length(frontier) < 1) {
    break
  }
  # Take the first item off the frontier
  exploredlink <- frontier[1]
  frontier <- frontier[-1]
  # Skip direct links to images
  if (str_detect(exploredlink, "\\.jpg$")) {
    next
  }
  # fill in your code here
}
############ USEFUL CODE SNIPPETS ########
# How to get HTML: download the page; on any error fall back to "" so the
# crawler skips the page instead of crashing
doc <- tryCatch(getURL(exploredlink),error=function(cond){return("")})
# Skip pages that failed to download or are trivially short
if(str_length(doc)<10){ next }
doc <- htmlParse(doc)
# Extract the host part of the URL so same-site links can be skipped later.
# FIX: the original pattern ".*\\.com" only worked for .com sites and,
# being greedy, could match into the path; this captures the host of any
# http(s) URL (e.g. "news.google.com").
domain <- str_match(exploredlink, "^https?://([^/]+)")[, 2]
if(is.na(domain)){ next }
###
#How to get a title titleText <- tryCatch(xmlToDataFrame(nodes = getNodeSet(doc, "//title")),error=function(cond){return("")}) if(titleText==""){ next } titleText <- as.vector(titleText$text) titleText <- unique(titleText)
###
# How to get body text: fetch with httr and strip the HTML to plain text;
# any error yields "" so the crawler keeps going
bodyText<- tryCatch(htmlToText(content(GET(exploredlink),type="text/html",as="text")),error=function(cond){return("")})
# Normalize whitespace, lowercase, and split into a vector of words.
# FIX: the original pattern "(\t|\ |\ )" was garbled in extraction (escaped
# spaces where "\\r|\\n" belongs); "(\\t|\\r|\\n)" restores the intended
# tab/newline replacement before collapsing whitespace runs and splitting.
bodyText<-str_split(tolower(str_replace_all((str_replace_all(bodyText,"(\\t|\\r|\\n)"," ")),"\\s{2,}"," "))," ")[[1]]
###
# How to get links from a page: collect every <a> element and pull out its
# href attribute (NULL when the anchor has no href)
anchor <- getNodeSet(doc, "//a")
anchor <- sapply(anchor, function(x) xmlGetAttr(x, "href"))
if (length(anchor) > 0) {
  temp <- c()
  # FIX: seq_along() instead of 1:length(anchor) (safe if anchor is empty)
  for (i in seq_along(anchor)) {
    # skip anchors with no href attribute
    if (is.null(anchor[[i]])) { next }
    # keep only absolute http(s) links; relative paths are dropped
    if (!str_detect(anchor[[i]][1], "^http")) { next }
    # skip links back to the current site.
    # FIX: fixed() treats domain as a literal string — previously its dots
    # acted as regex wildcards and could match unrelated hosts.
    if (str_detect(anchor[[i]][1], fixed(domain))) { next }
    temp <- append(temp, str_trim(anchor[[i]][1]))
  }
  anchor <- temp
  rm(temp)
  # add the harvested links and de-duplicate the frontier
  frontier <- append(frontier, anchor)
  frontier <- unique(frontier)
}
###
Step by Step Solution
There are 3 Steps involved in it
Step: 1
Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started