Answered step by step
Verified Expert Solution
Question
1 Approved Answer
implement the RecursiveCrawler class in Java . import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import java.io.*; import java.util.*; public class RecursiveCrawler extends Crawler { /** *
implement the RecursiveCrawler class in Java.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.*;
import java.util.*;

/**
 * Crawler that recursively visits every reachable local HTML page,
 * recording valid pages in {@code foundPages} and everything else
 * (invalid links, duplicates excluded, missing files) in {@code skippedPages}.
 */
public class RecursiveCrawler extends Crawler {

    /**
     * Instantiates an empty crawler.
     */
    public RecursiveCrawler() {
        super();
    }

    /**
     * Initiate a recursive crawl starting at the given page. Follows the
     * pseudocode contract: add the page as found, parse it with jsoup,
     * examine every anchor href, skip invalid/duplicate/missing targets,
     * and recurse into each valid, existing, not-yet-seen page.
     *
     * @param pageFileName the page file name
     * @throws RuntimeException if the page file cannot be read or parsed
     */
    @Override
    public void crawl(String pageFileName) {
        // Record this page as found before following its links so that
        // cycles (A links to B, B links back to A) terminate.
        foundPages.add(pageFileName);

        File pageFile = new File(pageFileName);
        Document doc;
        try {
            // Parse the local HTML file; UTF-8 is the conventional charset
            // for these assignment fixtures.
            doc = Jsoup.parse(pageFile, "UTF-8");
        } catch (IOException e) {
            // Preserve the cause so callers can diagnose the bad file.
            throw new RuntimeException("Could not parse page: " + pageFileName, e);
        }

        // Every anchor element that carries an href attribute.
        for (Element item : doc.select("a[href]")) {
            String linkHref = item.attr("href");

            // Non-local or non-HTML links (http://, javascript:, .png, ...)
            // are recorded as skipped and not followed.
            if (!validPageLink(linkHref)) {
                skippedPages.add(linkHref);
                continue;
            }

            // Resolve the href relative to the directory of the current page.
            // NOTE(review): assumes Util.relativeFileName(baseDir, relative)
            // — confirm against the provided Util class.
            String linkFile = Util.relativeFileName(pageFile.getParent(), linkHref);

            // Already processed (found or skipped) — do not revisit.
            if (foundPages.contains(linkFile) || skippedPages.contains(linkFile)) {
                continue;
            }

            // Valid-looking link whose target file does not exist: skip it.
            if (!new File(linkFile).exists()) {
                skippedPages.add(linkFile);
                continue;
            }

            // Valid, existing, unseen page: recurse.
            crawl(linkFile);
        }
    }
}
Crawler.java
import java.util.*;

/**
 * Abstract class that provides the structure for a web crawler.
 * Child classes implement {@link #crawl(String)}; this class owns the
 * bookkeeping of found and skipped pages.
 */
public abstract class Crawler {
    /** Unique, valid pages discovered so far. */
    protected ArraySet<String> foundPages;
    /** Unique pages that were skipped (invalid, non-existent, or non-local). */
    protected ArraySet<String> skippedPages;

    /**
     * Constructor that creates an empty crawler.
     */
    public Crawler() {
        foundPages = new ArraySet<>();
        skippedPages = new ArraySet<>();
    }

    /**
     * Initiate a crawl on the given page. Child classes should override this.
     *
     * @param pageFileName the page file name
     */
    public abstract void crawl(String pageFileName);

    /**
     * Returns the unique pages that have been found so far and are valid.
     * Each item in the returned list should be unique and refer to a valid
     * file that exists.
     *
     * @return the list of found pages
     */
    public List<String> foundPagesList() {
        return foundPages.asList();
    }

    /**
     * Returns the unique pages that have been skipped so far. These may be
     * invalid as per {@link #validPageLink(String)}, non-existent files, or
     * links off of the local file system.
     *
     * @return the list of skipped pages
     */
    public List<String> skippedPagesList() {
        return skippedPages.asList();
    }

    /**
     * Returns a string of pages that have been found so far. Each page is
     * shown on its own line terminated with a newline.
     *
     * @return the string
     */
    public String foundPagesString() {
        StringBuilder sb = new StringBuilder();
        for (String page : foundPagesList()) {
            // Javadoc contract: one page per line, newline-terminated.
            sb.append(page);
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Returns a string of pages that have been skipped so far. Each page is
     * shown on its own line terminated with a newline.
     *
     * @return the string
     */
    public String skippedPagesString() {
        StringBuilder sb = new StringBuilder();
        for (String page : skippedPagesList()) {
            sb.append(page);
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Returns true if the given pageFileName is valid and false otherwise.
     * Valid page links do not start with http://, https://, file:// or
     * javascript: but they are local file names like A.html and
     * subdir/D.html. Valid page links are to files ending with the .html or
     * .HTML extension. Other file types like .jpg, .png, .jpeg should
     * generate a false return value which causes them to be skipped.
     *
     * @param pageFileName the page file name
     * @return true if the link is a local .html/.HTML file name
     */
    public static boolean validPageLink(String pageFileName) {
        if (pageFileName == null) {
            return false;
        }
        // Reject non-local schemes explicitly, per the documented contract.
        if (pageFileName.startsWith("http://")
                || pageFileName.startsWith("https://")
                || pageFileName.startsWith("file://")
                || pageFileName.startsWith("javascript:")) {
            return false;
        }
        return pageFileName.endsWith(".html") || pageFileName.endsWith(".HTML");
    }
}
ArraySet.java
import java.util.*;

/**
 * Sorted, array-backed set of unique items. Implements {@link Iterable}
 * so the set can be used in for-each loops. Elements are kept in
 * {@code compareTo} order, which lets membership queries use binary search.
 *
 * @param <T> the element type; its {@code compareTo} is assumed consistent
 *            with {@code equals}
 */
public class ArraySet<T extends Comparable<T>> implements Iterable<T> {
    /** Backing list, always maintained in sorted ascending order. */
    private ArrayList<T> arraySet;

    /**
     * Creates an empty ArraySet.
     */
    public ArraySet() {
        arraySet = new ArrayList<>();
    }

    /**
     * Size of the set.
     *
     * @return the number of unique items in the set
     */
    public int size() {
        return arraySet.size();
    }

    /**
     * Returns the contents of the set as a shallow copy of the list, so
     * callers cannot mutate the set through the returned list.
     *
     * @return a shallow copy of the set's contents, in sorted order
     */
    public List<T> asList() {
        return new ArrayList<>(arraySet);
    }

    /**
     * Membership test. Uses binary search since the backing list is sorted.
     *
     * @param query the query item
     * @return true if the query item is present in the set, false otherwise
     */
    public boolean contains(T query) {
        return Collections.binarySearch(arraySet, query) >= 0;
    }

    /**
     * Adds the item if it is not already present, keeping the list sorted.
     * If the item is null, raises a RuntimeException.
     *
     * @param item the item
     * @return true if the given item is added to the set; false if the item
     *         is already present
     */
    public boolean add(T item) {
        if (item == null) {
            throw new RuntimeException("ArraySet does not support null items");
        }
        int index = Collections.binarySearch(arraySet, item);
        if (index >= 0) {
            // Already present — sets hold unique items only.
            return false;
        }
        // binarySearch returns (-(insertion point) - 1) for absent items;
        // inserting there keeps the list sorted.
        arraySet.add(-index - 1, item);
        return true;
    }

    /**
     * Retrieves an item in the set that is equal to the query item.
     *
     * @param query the query
     * @return the stored item equal to the query, or null if no item in the
     *         set is equal to the query
     */
    public T get(T query) {
        int index = Collections.binarySearch(arraySet, query);
        return index >= 0 ? arraySet.get(index) : null;
    }

    /**
     * The string is identical in format to Lists, making use of the
     * backing list's toString method.
     *
     * @return a string representation of the set and its contents
     */
    @Override
    public String toString() {
        return arraySet.toString();
    }

    /**
     * Returns an iterator over the set's elements in sorted order.
     *
     * @return a fresh iterator
     */
    @Override
    public Iterator<T> iterator() {
        return new SetIterator();
    }

    /**
     * Inner iterator class that implements hasNext() and next().
     */
    private class SetIterator implements Iterator<T> {
        private int currentIndex = 0;

        @Override
        public boolean hasNext() {
            return currentIndex < size();
        }

        @Override
        public T next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return arraySet.get(currentIndex++);
        }
    }
}
Step by Step Solution
★★★★★
3.35 Rating (155 Votes )
There are 3 Steps involved in it
Step: 1
Sure! Here's the implementation of the RecursiveCrawler class in Java: `import org.jsoup.Jsoup; import ...` Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance.
Get Started