The activities for this part are as follows:

1. Review the program closely and make sure you understand it fully.

2. Use MS Word to write a report that explains how the application works. Be sure to include the following items:

=> For each section of the code (functions/methods and significant portions of code inside them), explain in great detail how it works. Assume the reader has little experience with HDFS and searching methods. The reader is an experienced Python programmer. In other words, I am not interested in how Python works but in how the application works. => Include diagrams (UML, PowerPoint, draw.io, etc.) and screenshots to support your writing. For example, show me how this program interacts with the HDFS file system and/or the node manager, resource manager, etc. => Provide a critical (not nasty or unprofessional) evaluation of the code and propose a better solution. Please do not just say "the code is good and I have no comment"! That will not get you a very good grade. Provide me with three (3) possible improvements.

38 of index maker.py u til.py do readme.md #!/usr/bin/python3 from pyspark import Sparkcontext, Sparkconf import util import collections import sys import os import shutil "For a given input directory that has file with an extension of .txt create the same file with an extention of .idx store in the output directory class IndexMaker(): definit__(self): self.conf = Sparkconf().setAppName("Index Maker App) self.sc = SparkContext(conf=self.conf) def word count (self, file name): self.index_by_word = {} self text = self.sc.textFile(file_name) self.counts = self.text \ .flatmap(lambda line: util.strip punc(line) .map(lambda word: (word. lower(), 1)) \ . reduceByKey (lambda a, b: a + b) def make_index(self): for items in self.counts.collect(): if items[0].strip != '': self.index_by_word[items [0]] = [] line no = 1 for line in self.text.collect(): line = util.strip punc(line) for word in line: if word.strip() != ": self.index_by_word[word. Lower()].append(line_no) line_no += 1 def write index(self, local_file name) : if os.path.exists(local_file_name): os.remove(local file name) local_file = open(local_file name, 'W') self.index by word = collections.OrderedDict(sorted(self.index by word. 
items())) for word in self.index_by_word: local file.write(word + ' ' + str(self.index_by word[word]) + In astaticmethod def prepare_dirs(argv): in dir = argv[0] out_dir = argv[1] Local dir = '/tmp/id index_maker.py local file.write(word + ' ' + str(self.index by word[word]) + n) @staticmethod def prepare_dirs(argv): in dir = argv[O] out_dir = argv[1] local dir = '/tmp/idx/ if os.path.exists(local_dir): shutil.rmtree(local_dir) os.mkdir(local dir) util.delete_out_dir(out_dir) in file names = util.get_file_names (in dir) out_file_names = [] for file_name in in file_names: out_file_names.append(file name.replace('.txt', '.idx')) return in dir, out_dir, in file names, out_file_names, local dir @staticmethod def prepare file_names(in dir, local dir, in_file_name, out_file_name) : in_file_name = in dir + '/' + in_file_name local_file_name = Local dir + out_file_name print(' in:', in file_name) print('out:', local_file_name) return in_file_name, local_file_name @staticmethod def main(argv): in dir, out_dir, in file names, out_file names, local_dir = IndexMaker.prepare_dirs(argv) im = IndexMaker() for i in range(0, len(in_file_names)): in_file_name, local_file_name = IndexMaker.prepare_file_names (in dir, local_dir, in_file_names[i], out_file_names[i]) im.word_count(in_file name) im.make_index im.write_index (local_file_name) print('inin') print(" in_dir:', in dir) print("out_dir:', out_dir) print('local_dir:', local_dir) util.copy_from_local (local_dir, out_dir) name == main : IndexMaker.main(sys.argv[1:1) readme.md index maker.py u til.py sq!/usr/bin/python3 import string import re import subprocess def delete_out_dir(out_dir): subprocess.call(["hdfs", "dfs", "-rm", "-R", out_dir]) def copy_from local(local_files, remote dir): print(local_files, remote_dir) subprocess.call(["hdfs", "dfs". "-copyFromLocal", local files, remote dir]) def get_file_names (in_dir): buf = file names = [] with subprocess. 
Popen ( ["hdfs", "dfs", "-Is", in dir], stdout=subprocess.PIPE) as proc: buf t= re. sub'It', '', str(proc.stdout.read()); for line in buf.split('lin): tokens = [] for token in line.split(' '): if token.strip() != '': tokens.append(token) if len(tokens)

Question

Image text transcribed:

The activities for this part are as follows:

1. Review the program closely and make sure you understand it fully.

2. Use MS Word to write a report that explains how the application works. Be sure to include the following items:

=> For each section of the code (functions/methods and significant portions of code inside them), explain in great detail how it works. Assume the reader has little experience with HDFS and searching methods. The reader is an experienced Python programmer. In other words, I am not interested in how Python works but in how the application works. => Include diagrams (UML, PowerPoint, draw.io, etc.) and screenshots to support your writing. For example, show me how this program interacts with the HDFS file system and/or the node manager, resource manager, etc. => Provide a critical (not nasty or unprofessional) evaluation of the code and propose a better solution. Please do not just say "the code is good and I have no comment"! That will not get you a very good grade. Provide me with three (3) possible improvements.

38 of index maker.py u til.py do readme.md #!/usr/bin/python3 from pyspark import Sparkcontext, Sparkconf import util import collections import sys import os import shutil "For a given input directory that has file with an extension of .txt create the same file with an extention of .idx store in the output directory class IndexMaker(): definit__(self): self.conf = Sparkconf().setAppName("Index Maker App) self.sc = SparkContext(conf=self.conf) def word count (self, file name): self.index_by_word = {} self text = self.sc.textFile(file_name) self.counts = self.text \ .flatmap(lambda line: util.strip punc(line) .map(lambda word: (word. lower(), 1)) \ . reduceByKey (lambda a, b: a + b) def make_index(self): for items in self.counts.collect(): if items[0].strip != '': self.index_by_word[items [0]] = [] line no = 1 for line in self.text.collect(): line = util.strip punc(line) for word in line: if word.strip() != ": self.index_by_word[word. Lower()].append(line_no) line_no += 1 def write index(self, local_file name) : if os.path.exists(local_file_name): os.remove(local file name) local_file = open(local_file name, 'W') self.index by word = collections.OrderedDict(sorted(self.index by word. 
items())) for word in self.index_by_word: local file.write(word + ' ' + str(self.index_by word[word]) + In astaticmethod def prepare_dirs(argv): in dir = argv[0] out_dir = argv[1] Local dir = '/tmp/id index_maker.py local file.write(word + ' ' + str(self.index by word[word]) + n) @staticmethod def prepare_dirs(argv): in dir = argv[O] out_dir = argv[1] local dir = '/tmp/idx/ if os.path.exists(local_dir): shutil.rmtree(local_dir) os.mkdir(local dir) util.delete_out_dir(out_dir) in file names = util.get_file_names (in dir) out_file_names = [] for file_name in in file_names: out_file_names.append(file name.replace('.txt', '.idx')) return in dir, out_dir, in file names, out_file_names, local dir @staticmethod def prepare file_names(in dir, local dir, in_file_name, out_file_name) : in_file_name = in dir + '/' + in_file_name local_file_name = Local dir + out_file_name print(' in:', in file_name) print('out:', local_file_name) return in_file_name, local_file_name @staticmethod def main(argv): in dir, out_dir, in file names, out_file names, local_dir = IndexMaker.prepare_dirs(argv) im = IndexMaker() for i in range(0, len(in_file_names)): in_file_name, local_file_name = IndexMaker.prepare_file_names (in dir, local_dir, in_file_names[i], out_file_names[i]) im.word_count(in_file name) im.make_index im.write_index (local_file_name) print('inin') print(" in_dir:', in dir) print("out_dir:', out_dir) print('local_dir:', local_dir) util.copy_from_local (local_dir, out_dir) name == main : IndexMaker.main(sys.argv[1:1) readme.md index maker.py u til.py sq!/usr/bin/python3 import string import re import subprocess def delete_out_dir(out_dir): subprocess.call(["hdfs", "dfs", "-rm", "-R", out_dir]) def copy_from local(local_files, remote dir): print(local_files, remote_dir) subprocess.call(["hdfs", "dfs". "-copyFromLocal", local files, remote dir]) def get_file_names (in_dir): buf = file names = [] with subprocess. 
Popen ( ["hdfs", "dfs", "-Is", in dir], stdout=subprocess.PIPE) as proc: buf t= re. sub'It', '', str(proc.stdout.read()); for line in buf.split('lin): tokens = [] for token in line.split(' '): if token.strip() != '': tokens.append(token) if len(tokens)

Accepted Answer

The Answer is in the image, click to view ...

Question

The activities for this part are as follows: 1. Review the program closely and make sure you understand the program fully. 2. Use MS WORD

Step by Step Solution

Step: 1

Get Instant Access to Expert-Tailored Solutions

Step: 2

Step: 3

Ace Your Homework with AI

Recommended Textbook for

Essentials of Database Management

Students also viewed these Databases questions

Question

Question

Question

Question

Question

Question

Question

Question

Question

Question

Question

Question

Question

Question