Question:
import urllib.request, json

sms_corpus = []
with urllib.request.urlopen("https://storage.googleapis.com/wd13/SMSSpamCollection.txt") as url:
    for line in url.readlines():
        sms_corpus.append(line.decode().split('\t'))

# print the text and label of document 16
docid = 16
print(sms_corpus[docid])

# print the label of document 16
docid = 16
print(sms_corpus[docid][0])

# print the text of document 16
docid = 16
print(sms_corpus[docid][1])
# TOKENIZER WITH LEMMATIZER & STOPWORDS #
# importing NLTK libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
def tokenize(doc):
    punctuation = ['.', ',', ';', ':', '!', '?', '(', ')', '{', '}', '"', '\'']
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Tokenize the document into words
    words = word_tokenize(doc)
    # Convert to lowercase, remove punctuation, lemmatize, and remove stopwords
    words = [lemmatizer.lemmatize(word.lower()) for word in words
             if word not in punctuation and word.lower() not in stop_words]
    return words
# testing the tokenizer on one document's label and text
docid = 14
print(sms_corpus[docid][0]) #label
print(tokenize(sms_corpus[docid][1])) #tokenized document
from math import log
log(1)  # sanity check: natural log of 1 is 0.0
-----------------------------
QUESTIONS:
Q1: Given the code above, I wrote the code below for this task: "Calculate scores (computing TF-IDF scores) for every token in the corpus. Store these scores in a dictionary called token_scores."
My problem is that the result is tokenized by sentence, not by word (check the output). How can I calculate the scores per word, working with the array? Please explain as well (a sketch of what I think the per-word approach should look like follows the code below).
token_scores = {}

def calculate_token_scores(tokenized_docs):
    token_scores = {}
    # Count the frequency of each token in the corpus
    token_counts = {}
    for doc in tokenized_docs:
        for token in doc:
            if token in token_counts:
                token_counts[token] += 1
            else:
                token_counts[token] = 1
    # Calculate the token scores based on their frequency
    total_docs = len(tokenized_docs)
    for token, count in token_counts.items():
        token_scores[token] = log(total_docs / count)
    return token_scores

calculate_token_scores(sms_corpus)
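To show what I mean, here is a rough sketch of the per-word direction I think I need (just my assumption of the intended approach, not a confirmed solution; tokenized_docs and doc_freq are names I made up, and it reuses the tokenize() function and sms_corpus list from above):

from math import log

# tokenize every document's text first, so the corpus becomes a list of word lists
tokenized_docs = [tokenize(doc[1]) for doc in sms_corpus]  # doc[1] is the message text

# count in how many documents each token appears (set() counts a token once per document)
doc_freq = {}
for doc in tokenized_docs:
    for token in set(doc):
        doc_freq[token] = doc_freq.get(token, 0) + 1

# score each token with the same log(total_docs / count) formula as above
total_docs = len(tokenized_docs)
token_scores = {token: log(total_docs / count) for token, count in doc_freq.items()}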
Q2: What tokens are most predictive of a message being SPAM? Can you please explain as well? (My rough attempt is sketched below.)
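For context, the only idea I have so far is to compare how often each token appears in spam messages versus ham messages and rank tokens by that ratio. This is just a sketch of my own guess, not necessarily the intended method; spam_counts, ham_counts, and spam_ratio are placeholder names, and it assumes doc[0] holds the 'spam'/'ham' label as in the loading code above.

from collections import Counter

spam_counts, ham_counts = Counter(), Counter()
for doc in sms_corpus:
    label, text = doc[0], doc[1]
    tokens = tokenize(text)
    if label == 'spam':
        spam_counts.update(tokens)
    else:
        ham_counts.update(tokens)

# rank tokens by how much more often they occur in spam than in ham
# (add-one smoothing avoids division by zero for spam-only tokens)
spam_ratio = {tok: (spam_counts[tok] + 1) / (ham_counts[tok] + 1) for tok in spam_counts}
print(sorted(spam_ratio, key=spam_ratio.get, reverse=True)[:20])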