Question:
import urllib.request, json

sms_corpus = []
with urllib.request.urlopen("https://storage.googleapis.com/wd13/SMSSpamCollection.txt") as url:
    for line in url.readlines():
        sms_corpus.append(line.decode().split('\t'))

# print the text and label of document 16
docid = 16
print(sms_corpus[docid])

# print the label of document 16
docid = 16
print(sms_corpus[docid][0])

# print the text of document 16
docid = 16
print(sms_corpus[docid][1])
# TOKENIZER WITH LEMMATIZER & STOPWORDS #
# importing NLTK libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
def tokenize(doc):
    punctuation = ['.', ',', ';', ':', '!', '?', '(', ')', '{', '}', '"', '\'']
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Tokenize the document into words
    words = word_tokenize(doc)
    # Convert to lowercase, remove punctuation, lemmatize, and remove stopwords
    words = [lemmatizer.lemmatize(word.lower()) for word in words
             if word not in punctuation and word.lower() not in stop_words]
    return words
# testing the tokenizer on one document's label and text
docid = 14
print(sms_corpus[docid][0]) #label
print(tokenize(sms_corpus[docid][1])) #tokenized document
from math import log
log(1)  # sanity check: natural log of 1 is 0.0
-----------------------------
QUESTIONS:
Q1: Given the code above, I wrote the code below for this task: "Calculate scores (computing TF-IDF scores) for every token in the corpus. Store these scores in a dictionary called token_scores."
My problem is that the result is tokenized by sentence, not by word (check the output). How can I calculate the scores per word, working with the array? Please explain as well (a sketch of what I think the per-word approach should look like follows the code below).
token_scores = {}

def calculate_token_scores(tokenized_docs):
    token_scores = {}
    # Count the frequency of each token in the corpus
    token_counts = {}
    for doc in tokenized_docs:
        for token in doc:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1
    # Calculate the token scores based on their frequency
    total_docs = len(tokenized_docs)
    for token, count in token_counts.items():
        token_scores[token] = log(total_docs / count)
    return token_scores

calculate_token_scores(sms_corpus)
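To show what I mean, here is a rough sketch of the per-word direction I think I need (just my assumption of the intended approach, not a confirmed solution; tokenized_docs and doc_freq are names I made up, and it reuses the tokenize() function and sms_corpus list from above):

from math import log

# tokenize every document's text first, so the corpus becomes a list of word lists
tokenized_docs = [tokenize(doc[1]) for doc in sms_corpus]  # doc[1] is the message text

# count in how many documents each token appears (set() counts a token once per document)
doc_freq = {}
for doc in tokenized_docs:
    for token in set(doc):
        doc_freq[token] = doc_freq.get(token, 0) + 1

# score each token with the same log(total_docs / count) formula as above
total_docs = len(tokenized_docs)
token_scores = {token: log(total_docs / count) for token, count in doc_freq.items()}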
Q2: What tokens are most predictive of a message being SPAM? Can you please explain as well? (My rough attempt is sketched below.)
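For context, the only idea I have so far is to compare how often each token appears in spam messages versus ham messages and rank tokens by that ratio. This is just a sketch of my own guess, not necessarily the intended method; spam_counts, ham_counts, and spam_ratio are placeholder names, and it assumes doc[0] holds the 'spam'/'ham' label as in the loading code above.

from collections import Counter

spam_counts, ham_counts = Counter(), Counter()
for doc in sms_corpus:
    label, text = doc[0], doc[1]
    tokens = tokenize(text)
    if label == 'spam':
        spam_counts.update(tokens)
    else:
        ham_counts.update(tokens)

# rank tokens by how much more often they occur in spam than in ham
# (add-one smoothing avoids division by zero for spam-only tokens)
spam_ratio = {tok: (spam_counts[tok] + 1) / (ham_counts[tok] + 1) for tok in spam_counts}
print(sorted(spam_ratio, key=spam_ratio.get, reverse=True)[:20])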