Question
# BEGIN - DO NOT CHANGE THESE IMPORTS/CONSTANTS OR IMPORT ADDITIONAL PACKAGES.
from torch.utils.data import Dataset
# END - DO NOT CHANGE THESE IMPORTS/CONSTANTS OR IMPORT ADDITIONAL PACKAGES.
# HeadlineDataset
# This class takes a Pandas DataFrame and wraps it in a Torch Dataset.
# Read more about Torch Datasets here:
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
#
class HeadlineDataset(Dataset):
# initialize this class with appropriate instance variables
def __init__(self, vocab, df, max_length=50):
# For this method: We would *strongly* recommend storing the dataframe
# itself as an instance variable, and keeping this method
# very simple. Leave processing to __getitem__.
#
# Sometimes, however, it does make sense to preprocess in
# __init__. If you are curious as to why, read the aside at the
# bottom of this cell.
#
## YOUR CODE STARTS HERE (~3 lines of code) ##
return
## YOUR CODE ENDS HERE ##
# return the length of the dataframe instance variable
def __len__(self):
df_len = None
## YOUR CODE STARTS HERE (1 line of code) ##
## YOUR CODE ENDS HERE ##
return df_len
# __getitem__
#
# Converts a dataframe row (row["tokenized"]) to an encoded torch LongTensor,
# using our vocab map created using generate_vocab_map. Restricts the encoded
# headline length to max_length.
#
# The purpose of this method is to convert the row - a list of words - into
# a corresponding list of numbers.
#
# e.g., using the map {"hi": 2, "hello": 3, "UNK": 0},
# the list ["hi", "hello", "NOT_IN_DICT"] turns into [2, 3, 0]
#
# returns:
# tokenized_word_tensor - torch.LongTensor
# A 1D tensor of type Long, that has each
# token in the dataframe mapped to a number.
# These numbers are retrieved from the vocab_map
# we created in generate_vocab_map.
#
# **IMPORTANT**: if we filtered out the word
# because it's infrequent (and it doesn't exist
# in the vocab) we need to replace it with
# the UNK token
#
# curr_label - int
# Binary 0/1 label retrieved from the DataFrame.
#
def __getitem__(self, index: int):
tokenized_word_tensor = None
curr_label = None
## YOUR CODE STARTS HERE (~3-7 lines of code) ##
## YOUR CODE ENDS HERE ##
return tokenized_word_tensor, curr_label
#
# Completely optional aside on preprocessing in __init__.
#
# Sometimes the compute bottleneck actually ends up being in __getitem__.
# In this case, you'd loop over your dataset in __init__, passing data
# to __getitem__ and storing it in another instance variable. Then,
# you can simply return the preprocessed data in __getitem__ instead of
# doing the preprocessing.
#
# There is a tradeoff though: can you think of one?
#
Step by Step Solution
The solution fills in the three method bodies of HeadlineDataset, one per step.
Step 1: __init__
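Inside HeadlineDataset, it is enough to store the constructor arguments as instance variables and defer all processing to __getitem__, as the skeleton's comments recommend. A minimal sketch:

def __init__(self, vocab, df, max_length=50):
    # Keep __init__ simple: just hold on to the inputs.
    self.vocab = vocab
    self.df = df
    self.max_length = max_length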
Step 2: __len__
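len() on a Pandas DataFrame returns its number of rows, so __len__ can delegate directly:

def __len__(self):
    df_len = len(self.df)  # number of rows in the stored DataFrame
    return df_len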
Step 3: __getitem__
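A sketch of __getitem__, assuming vocab is a dict with an "UNK" entry (as in the question's example) and that the binary label is stored in a DataFrame column named "label". The question never names the label column, so treat that name as a placeholder and match it to your data. torch is assumed to be imported earlier in the notebook, since the question forbids adding imports.

def __getitem__(self, index: int):
    # .iloc makes this robust to DataFrames whose index is not
    # a clean 0..n-1 range (e.g. after a train/validation split).
    row = self.df.iloc[index]
    # Truncate the tokenized headline to at most max_length tokens.
    tokens = row["tokenized"][: self.max_length]
    # Map each token to its id, falling back to UNK for words
    # that were filtered out of the vocab as infrequent.
    unk_id = self.vocab["UNK"]
    tokenized_word_tensor = torch.LongTensor(
        [self.vocab.get(tok, unk_id) for tok in tokens]
    )
    curr_label = int(row["label"])  # "label" column name is an assumption
    return tokenized_word_tensor, curr_label

As for the optional aside's question: the usual tradeoff of preprocessing everything in __init__ is memory, since the entire encoded dataset must sit in RAM at once, plus a longer startup delay before the first batch is available.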