Question
The code is in Python ranking.py import json import typing def term_count(query: str, document: str) -> int: count = 0 query_terms = query.lower().split() document_terms =
The code is in Python
ranking.py
import json
import typing
def term_count(query: str, document: str) -> int:
    """
    Count how many times the query terms occur in the document.

    Both strings are lower-cased and split on whitespace. Every occurrence of a
    query term in the document adds one to the score, and a term that is
    repeated in the query is counted once per repetition.

    :param query: The text a user searched for.
    :param document: A candidate document.
    :return: Number of (query term, document term) pairs that are equal.
    """
    from collections import Counter

    # Tally the document once so that each query term costs a single lookup
    # instead of a full rescan of the document's terms.
    document_term_counts = Counter(document.lower().split())
    # Counter returns 0 for terms that never occur in the document.
    return sum(document_term_counts[term] for term in query.lower().split())
def boolean_term_count(query: str, document: str) -> int:
    """
    Count how many query terms appear at least once in the document.

    Both strings are lower-cased and split on whitespace. Each query term
    contributes at most one to the score no matter how often it occurs in the
    document; a term repeated in the query is counted once per repetition.

    :param query: The text a user searched for.
    :param document: A candidate document.
    :return: Number of query terms that are present in the document.
    """
    # A set is built once so each membership test does not scan the document.
    document_terms = set(document.lower().split())
    return sum(1 for term in query.lower().split() if term in document_terms)
def search(query: str, documents: typing.List[str]) -> typing.List[str]:
    """
    Rank documents by their term_count score for the query.

    :param query: The text to search for.
    :param documents: A list of strings representing the documents to rank.
    :return: All documents, ordered from the highest score to the lowest.
    """
    # scores[i] is the score of documents[i].
    scores = [term_count(query=query, document=text) for text in documents]
    # Sort positions rather than the strings so each score is computed once.
    ranked_positions = sorted(range(len(documents)), key=scores.__getitem__, reverse=True)
    return [documents[position] for position in ranked_positions]
def run_search():
    """
    Run an interactive search loop over the wiki_small dataset.

    Loads the JSON records from the hard-coded dataset path and uses the
    'init_text' field of every record as the document text. It then prompts
    for queries and prints the result of search() for each one, stopping as
    soon as an empty query is entered.
    """
    with open(r'C:\Users\Alex\Documents\DePaul\datasets\wiki_small\wiki_small.json') as dataset_file:
        records = json.load(dataset_file)
    documents = [entry['init_text'] for entry in records]
    while True:
        query = input("Please enter a query:")
        if not query:
            # An empty line ends the session.
            break
        print(search(query, documents))
matching.py
import typing
def search(query: str, documents: typing.List[str]) -> typing.List[str]:
    """
    Filter the documents down to those that match the query.

    :param query: The text to search for.
    :param documents: A list of strings representing the documents to search over.
    :return: The documents for which boolean_term_match is true, in input order.
    """
    return [doc for doc in documents if boolean_term_match(query=query, document=doc)]
def string_match(query: str, document: str) -> bool:
    """
    Match a document by looking for the query as a substring of it.

    The comparison is case-sensitive; neither string is normalized.

    :param query: The text a user searched for.
    :param document: A candidate document.
    :return: True if the query occurs anywhere in the document and False otherwise.
    """
    # str.find returns -1 only when the substring is absent.
    return document.find(query) != -1
def boolean_term_match(query: str, document: str) -> bool:
    """
    Boolean (AND) matching function.

    Both strings are lower-cased and split on whitespace before comparison.
    A query with no terms matches every document.

    :param query: The text a user searched for.
    :param document: A candidate document.
    :return: True if all terms in the query are also in the document and False otherwise.
    """
    query_terms: typing.Set[str] = set(query.lower().split())
    document_terms: typing.Set[str] = set(document.lower().split())
    # Subset test on sets avoids rescanning the document for every query term.
    return query_terms <= document_terms
indexing_process.py
import json
import typing
class Document(typing.NamedTuple):
    """A source document: an identifier together with its text."""

    doc_id: str  # identifier of the document
    text: str  # text of the document
class DocumentCollection:
    """An in-memory list of Document records kept in insertion order."""

    def __init__(self):
        # Documents in the order they were added.
        self.docs: typing.List[Document] = []

    def add_document(self, doc: Document):
        """
        Append a document to the end of the collection.

        :param doc: The document to add.
        """
        self.docs.append(doc)

    def get_all_docs(self) -> typing.List[Document]:
        """
        :return: The collection's own list of documents (not a copy).
        """
        return self.docs
class TransformedDocument(typing.NamedTuple):
    """A document after transformation: an identifier together with its tokens."""

    doc_id: str  # identifier of the document
    tokens: typing.List[str]  # tokens of the document
class TransformedDocumentCollection:
    """A list of TransformedDocument records that can be saved to and loaded from JSON."""

    def __init__(self):
        # Transformed documents in the order they were added.
        self.docs: typing.List[TransformedDocument] = []

    def add_document(self, doc: TransformedDocument):
        """
        Append a transformed document to the end of the collection.

        :param doc: The transformed document to add.
        """
        self.docs.append(doc)

    def write(self, path: str):
        """
        Save the collection as JSON.

        The file holds a single object whose 'docs' entry is a list with one
        record per document, produced by the document's _asdict().

        :param path: Location of the file to write; it is opened in 'w' mode.
        """
        serialized_docs = []
        for transformed in self.docs:
            serialized_docs.append(transformed._asdict())
        payload = {'docs': serialized_docs}
        with open(path, 'w') as fp:
            json.dump(obj=payload, fp=fp)

    @staticmethod
    def read(path: str) -> 'TransformedDocumentCollection':
        """
        Load a collection from a JSON file laid out the way write() produces it.

        :param path: Location of the file to read.
        :return: A new collection with one document per record under 'docs'.
        """
        collection = TransformedDocumentCollection()
        with open(path) as fp:
            payload = json.load(fp)
        for entry in payload['docs']:
            collection.add_document(
                TransformedDocument(doc_id=entry['doc_id'], tokens=entry['tokens'])
            )
        return collection
class Index:
    """Placeholder for the search index; it defines no attributes or methods yet."""
class WikiSource:
    """Document source that loads records from a JSON file."""

    # Dataset location used when read_documents is called without a path.
    DEFAULT_PATH = r'C:\Users\Alex\Documents\DePaul\datasets\wiki_small\wiki_small.json'

    def read_documents(self, data_file_path: str = DEFAULT_PATH) -> DocumentCollection:
        """
        Load every record of the JSON file into a DocumentCollection.

        Each record's 'id' entry becomes the document id and its 'init_text'
        entry becomes the document text.

        :param data_file_path: Path of the JSON file to read.
        :return: A collection with one Document per record, in file order.
        """
        with open(data_file_path) as fp:
            records = json.load(fp)
        collection = DocumentCollection()
        for entry in records:
            collection.add_document(Document(doc_id=entry['id'], text=entry['init_text']))
        return collection
def tokenize(document_text: str) -> typing.List[str]:
    """
    Split text into lower-cased, whitespace-separated tokens.

    :param document_text: The text to tokenize.
    :return: The tokens in their order of appearance; empty for blank text.
    """
    lowered = document_text.lower()
    return lowered.split()
def transform_documents(document_collection: DocumentCollection) -> TransformedDocumentCollection:
    """
    Tokenize every document of a collection.

    :param document_collection: The documents to transform.
    :return: A new collection holding, for each input document and in the same
        order, a TransformedDocument with the same doc_id and the tokens that
        tokenize() produces from its text.
    """
    source_docs = document_collection.get_all_docs()
    transformed = TransformedDocumentCollection()
    for document in source_docs:
        transformed.add_document(
            TransformedDocument(doc_id=document.doc_id, tokens=tokenize(document.text))
        )
    return transformed
def create_index(transformed_documents):
    """
    Placeholder for index construction; it is not implemented yet.

    :param transformed_documents: Currently ignored.
    :return: Always None.
    """
    return None
def indexing_process(document_source: WikiSource) -> typing.Tuple[DocumentCollection, Index]:
    """
    Run the indexing pipeline: read documents, transform them, build the index.

    :param document_source: Source whose read_documents() supplies the documents.
    :return: A pair of the document collection that was read and the value
        returned by create_index() for the transformed documents.
    """
    document_collection = document_source.read_documents()
    transformed_documents = transform_documents(document_collection)
    # The transformed documents could be persisted at this point with
    # transformed_documents.write(path=...); that step is currently disabled.
    index = create_index(transformed_documents)
    return document_collection, index
Problem 1: Please describe the main difference between the search implementation in ranking.py and in matching.py. What would you do if you were asked to produce just the best 10 results for a query in each implementation?
Please add all of the following to the indexing_process.py file we have worked on during the lectures:
Step by Step Solution
There are 3 Steps involved in it
Step: 1
Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started