Answered step by step
Verified Expert Solution
Link Copied!

Question

1 Approved Answer

Implement a new class called DictDocumentCollection that is similar to DocumentCollection class in indexing_process written during the lectures, but uses a dict as an underlying

Implement a new class called DictDocumentCollection that is similar to DocumentCollection class in indexing_process written during the lectures, but uses a dict as an underlying storage data structure instead of a list. The dict should have doc_ids from the Documents as keys and Documents themselves as values. I.e. the following code:

docs = DictDocumentCollection()
docs.add_document(Document(doc_id='1', text='text1'))
docs.add_document(Document(doc_id='2', text='text2'))

Should result in docs.docs being equal to {'1': Document(doc_id='1', text='text1'), '2': Document(doc_id='2', text='text2')}. Write __init__, add_document, get_all_docs methods and write a unittest similar to the code above.

indexing_process.py

import json

import typing

class Document(typing.NamedTuple):
    """A raw source document: an identifier paired with its text."""

    # Identifier of the document.
    doc_id: str
    # Unprocessed text of the document.
    text: str

class DocumentCollection:
    """An in-memory collection of Document records stored in a list."""

    def __init__(self):
        """Create an empty collection."""
        # Documents are kept in the order they were added.
        self.docs: typing.List[Document] = []

    def add_document(self, doc: Document):
        """Append ``doc`` to the end of the collection."""
        self.docs.append(doc)

    def get_all_docs(self) -> typing.List[Document]:
        """Return the underlying list of documents (the list itself, not a copy)."""
        return self.docs

class TransformedDocument(typing.NamedTuple):
    """A document after transformation: its identifier and its list of tokens."""

    # Identifier of the document.
    doc_id: str
    # Tokens of the document.
    tokens: typing.List[str]

class TransformedDocumentCollection:
    """A list-backed collection of TransformedDocument records with JSON persistence.

    The on-disk format is a JSON object with a single key ``'docs'`` whose
    value is a list of ``{'doc_id': ..., 'tokens': [...]}`` records.
    """

    def __init__(self):
        """Create an empty collection."""
        self.docs: typing.List[TransformedDocument] = []

    def add_document(self, doc: TransformedDocument):
        """Append ``doc`` to the end of the collection."""
        self.docs.append(doc)

    def write(self, path: str):
        """Serialize the collection as JSON to the file at ``path``.

        An existing file at ``path`` is overwritten.
        """
        json_data = {'docs': [td._asdict() for td in self.docs]}
        # Explicit encoding so the file does not depend on the platform locale.
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(obj=json_data, fp=fp)

    @staticmethod
    def read(path: str) -> 'TransformedDocumentCollection':
        """Load a collection from a JSON file in the format produced by ``write``.

        Raises:
            KeyError: if the file lacks the ``'docs'`` key or a record lacks
                ``'doc_id'`` or ``'tokens'``.
        """
        out = TransformedDocumentCollection()
        with open(path, encoding='utf-8') as fp:
            collection_dict = json.load(fp)
        doc_records = collection_dict['docs']
        for record in doc_records:
            doc = TransformedDocument(doc_id=record['doc_id'], tokens=record['tokens'])
            out.add_document(doc)
        return out

class Index:
    """Placeholder for the search index type; it has no behavior yet."""

    pass

class WikiSource:
    """Document source that loads records from a JSON file."""

    # Machine-specific default location of the dataset; pass ``data_file_path``
    # explicitly to read from anywhere else.
    DEFAULT_PATH = r'C:\Users\Alex\Documents\DePaul\datasets\wiki_small\wiki_small.json'

    def read_documents(self, data_file_path: str = DEFAULT_PATH) -> DocumentCollection:
        """Read the JSON file at ``data_file_path`` into a DocumentCollection.

        The file is iterated as a sequence of records; each record's ``'id'``
        becomes the document's ``doc_id`` and its ``'init_text'`` becomes the
        document's ``text``.

        Raises:
            KeyError: if a record lacks ``'id'`` or ``'init_text'``.
        """
        # JSON is UTF-8 by specification; do not rely on the locale default
        # encoding, which is not UTF-8 on Windows.
        with open(data_file_path, encoding='utf-8') as fp:
            doc_records = json.load(fp)
        doc_collection = DocumentCollection()
        for record in doc_records:
            doc = Document(doc_id=record['id'], text=record['init_text'])
            doc_collection.add_document(doc)
        return doc_collection

def tokenize(document_text: str) -> typing.List[str]:
    """Lower-case ``document_text`` and split it on runs of whitespace.

    Punctuation is not stripped, so ``"Hi, there"`` yields ``['hi,', 'there']``.
    Empty or whitespace-only text yields an empty list.
    """
    return document_text.lower().split()

def transform_documents(document_collection: DocumentCollection) -> TransformedDocumentCollection:
    """Tokenize every document in ``document_collection``.

    Returns a new TransformedDocumentCollection with one TransformedDocument
    per input document, carrying the same ``doc_id``, in the order returned by
    ``get_all_docs()``.
    """
    docs = document_collection.get_all_docs()
    out = TransformedDocumentCollection()
    for d in docs:
        tokens = tokenize(d.text)
        transformed_doc = TransformedDocument(doc_id=d.doc_id, tokens=tokens)
        out.add_document(transformed_doc)
    return out

def create_index(transformed_documents):
    """Placeholder for index construction.

    Not implemented yet: ``transformed_documents`` is ignored and the function
    returns ``None``.
    """
    pass

def indexing_process(document_source: WikiSource) -> typing.Tuple[DocumentCollection, Index]:
    """Run the indexing pipeline: read documents, transform them, build an index.

    Returns:
        A ``(document_collection, index)`` pair. The second element is whatever
        ``create_index`` returns for the transformed documents.
    """
    document_collection = document_source.read_documents()
    transformed_documents = transform_documents(document_collection)
    # transformed_documents.write(path='')
    index = create_index(transformed_documents)
    return (document_collection, index)

Step by Step Solution

There are 3 Steps involved in it

Step: 1

blur-text-image

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

blur-text-image

Step: 3

blur-text-image

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Select Healthcare Classification Systems And Databases

Authors: Katherine S. Rowell, Ann Cutrell

1st Edition

0615909760, 978-0615909769

More Books

Students also viewed these Databases questions

Question

what we mean by a "process"

Answered: 1 week ago