Answered step by step

Verified Expert Solution

Link Copied!

Question

1 Approved Answer

Posted on Aug 29, 2024

add read and write methods to the DictDocumentCollection class above similar to those in TransformedDocument class (except for Documents not for TransformedDocuments) in indexing_process written

add read and write methods to the DictDocumentCollection class above similar to those in TransformedDocument class (except for Documents not for TransformedDocuments) in indexing_process written during the lectures.

indexing_process.py

import json

import typing

class Document(typing.NamedTuple):

doc_id: str

text: str

class DocumentCollection:

def __init__(self):

self.docs: typing.List[Document] = []

def add_document(self, doc: Document):

self.docs.append(doc)

def get_all_docs(self) -> typing.List[Document]:

return self.docs

class TransformedDocument(typing.NamedTuple):

doc_id: str

tokens: typing.List[str]

class TransformedDocumentCollection:

def __init__(self):

self.docs: typing.List[TransformedDocument] = []

def add_document(self, doc: TransformedDocument):

self.docs.append(doc)

def write(self, path: str):

json_data = {'docs': [td._asdict() for td in self.docs]}

with open(path, 'w') as fp:

json.dump(obj=json_data, fp=fp)

@staticmethod

def read(path: str) -> 'TransformedDocumentCollection':

out = TransformedDocumentCollection()

with open(path) as fp:

collection_dict = json.load(fp)

doc_records = collection_dict['docs']

for record in doc_records:

doc = TransformedDocument(doc_id=record['doc_id'], tokens=record['tokens'])

out.add_document(doc)

return out

class Index:

pass

class WikiSource:

DEFAULT_PATH = r'C:\Users\Alex\Documents\DePaul\datasets\wiki_small\wiki_small.json'

def read_documents(self, data_file_path: str = DEFAULT_PATH) -> DocumentCollection:

with open(data_file_path) as fp:

doc_records = json.load(fp)

doc_collection = DocumentCollection()

for record in doc_records:

doc = Document(doc_id=record['id'], text=record['init_text'])

doc_collection.add_document(doc)

return doc_collection

def tokenize(document_text: str) -> typing.List[str]:

return document_text.lower().split()

def transform_documents(document_collection: DocumentCollection) -> TransformedDocumentCollection:

docs = document_collection.get_all_docs()

out = TransformedDocumentCollection()

for d in docs:

tokens = tokenize(d.text)

transformed_doc = TransformedDocument(doc_id=d.doc_id, tokens=tokens)

out.add_document(transformed_doc)

return out

def create_index(transformed_documents):

pass

def indexing_process(document_source: WikiSource) -> (DocumentCollection, Index):

document_collection = document_source.read_documents()

transformed_documents = transform_documents(document_collection)

# transformed_documents.write(path='')

index = create_index(transformed_documents)

return (document_collection, index)

Step by Step Solution

There are 3 Steps involved in it

Step: 1

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

Step: 3

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Focus On Geodatabases In ArcGIS Pro

Authors: David W. Allen

1st Edition

★★★★★

Over a 24-hour period, observe and record oral and written communication that you would classify as negative messages because they are unpleasant, disappointing, or upsetting to the receiver. Record...

Answered: 1 week ago

Previous Question Next Question