Answered step by step
Verified Expert Solution
Question
1 Approved Answer
what is wrong with my code and correct it : # Import necessary libraries import re import pandas as pd import string import numpy as
What is wrong with my code? Please correct it.
# Import necessary libraries
import re
import string

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
def data_cleaning(text):
    """Return *text* as a space-separated string of cleaned tokens.

    The text is tokenized, lower-cased, stripped of non-alphabetic
    characters, lemmatized, and filtered against the NLTK English
    stopword list.

    Args:
        text: The raw document. Anything that is not a ``str`` (for
            example a NaN coming out of pandas) is treated as an empty
            document.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when the input
        is not a string or nothing survives cleaning.
    """
    if not isinstance(text, str):
        # Returning "" instead of falling through to None keeps later
        # tokenization from crashing on missing values.
        return ""

    tokens = word_tokenize(text)

    # Normalization
    normalized_tokens = [token.lower() for token in tokens]

    # Noise removal: strip non-alphabetic characters. Tokens that end up
    # empty (pure punctuation or digits) are dropped so they do not leave
    # stray spaces in the joined result.
    cleaned_tokens = [
        cleaned
        for cleaned in (re.sub(r"[^a-zA-Z]", "", token) for token in normalized_tokens)
        if cleaned
    ]

    # Lemmatization. A Porter-stemmed list used to be built here as well,
    # but its result was never used, so that step has been removed.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in cleaned_tokens]

    # Stopword removal. The local name must differ from the imported
    # ``stopwords`` corpus: rebinding that name inside the function makes
    # it local and raises UnboundLocalError on the right-hand side.
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

    # Join the tokens into a string
    return " ".join(filtered_tokens)
def train_word2vec_model(sentences, vector_size=100, window=5, min_count=1, workers=4):
    """Train a gensim Word2Vec model on tokenized sentences.

    Args:
        sentences: Iterable of token lists, one list per document.
        vector_size: Dimensionality of the learned word vectors.
        window: Maximum distance between the current and predicted word.
        min_count: Ignore words with a total frequency lower than this.
        workers: Number of worker threads used for training.

    Returns:
        The trained ``Word2Vec`` model.

    NOTE(review): the defaults above are conventional tutorial values;
    the numbers in the original call were not recoverable -- confirm.
    """
    # Word2Vec iterates over the corpus more than once, so a one-shot
    # generator would be exhausted after the vocabulary scan.
    corpus = list(sentences)
    return Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
def get_word_embeddings(text, model):
    """Return the concatenated word vectors of *text* as a flat list.

    Args:
        text: The document to embed. Non-string values are treated as an
            empty document.
        model: A trained Word2Vec model exposing ``wv`` and
            ``vector_size``.

    Returns:
        A flat list of floats. When at least one word of *text* is in the
        model vocabulary its length is ``n_known_words * vector_size``,
        so it varies from text to text; callers that need a fixed-width
        feature vector must pool it (for example by averaging). When no
        word is known, a zero vector of length ``model.vector_size``.
    """
    words = word_tokenize(text) if isinstance(text, str) else []
    embeddings = [model.wv[word] for word in words if word in model.wv]

    if not embeddings:
        # If no valid word is found in the model, return zeros
        return [0.0] * model.vector_size

    # Flatten the list of vectors into a 1D sequence
    return [float(value) for vector in embeddings for value in vector]
def _pooled_embedding(text, model):
    """Return one vector of length ``model.vector_size`` for *text*."""
    width = model.vector_size
    flat = np.asarray(get_word_embeddings(text, model), dtype=float)
    if flat.size == 0:
        return np.zeros(width)
    # The embedding holds one vector per known word laid end to end;
    # fold it to (n_words, width) and average so every document gets the
    # same number of features regardless of its length.
    return flat.reshape(-1, width).mean(axis=0)


def _embed_text_columns(frame, columns, model):
    """Replace each text column of *frame* by its numeric embedding columns."""
    parts = []
    for column in columns:
        matrix = np.vstack([_pooled_embedding(text, model) for text in frame[column]])
        names = [f"{column}_{i}" for i in range(matrix.shape[1])]
        parts.append(pd.DataFrame(matrix, index=frame.index, columns=names))
    return pd.concat(parts, axis=1)


def data_cleaning_pipeline(df_path, df_body, df_title):
    """Load the issue CSV and return numeric train/test features and labels.

    Steps: read the CSV, drop rows lacking a body or label, encode the
    labels, split into train and test, clean both text columns, train
    Word2Vec on the training bodies, and embed every text.

    Args:
        df_path: Path of the CSV file to read.
        df_body: Name of the column holding the issue body text.
        df_title: Name of the column holding the issue title text.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The X values are
        DataFrames with ``vector_size`` numeric columns per text column
        (named ``<column>_<i>``), which scikit-learn estimators can fit
        directly; the y values are integer-encoded label arrays.
    """
    label_column = "issue_label"
    text_columns = [df_body, df_title]

    df = pd.read_csv(df_path)

    # Drop rows where the body or the label is null
    df = df.dropna(subset=[df_body, label_column])

    # Titles can still be missing at this point; treat them as empty text
    # so cleaning and tokenization always receive strings.
    X = df[text_columns].fillna("").astype(str)

    # Convert categorical labels to numeric using LabelEncoder
    y = LabelEncoder().fit_transform(df[label_column])

    # NOTE(review): test_size and random_state are conventional values;
    # the originals were not recoverable -- confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Apply data cleaning to X_train and X_test. Building new frames
    # (rather than assigning into the split results) avoids pandas'
    # SettingWithCopy pitfalls.
    X_train = X_train.apply(lambda column: column.map(data_cleaning))
    X_test = X_test.apply(lambda column: column.map(data_cleaning))

    # Train Word2Vec on the training bodies only, so nothing from the
    # test set leaks into the embedding vocabulary.
    sentences = [word_tokenize(text) for text in X_train[df_body]]
    word2vec_model = train_word2vec_model(sentences)

    # Apply Word2Vec embeddings to X_train and X_test
    X_train = _embed_text_columns(X_train, text_columns, word2vec_model)
    X_test = _embed_text_columns(X_test, text_columns, word2vec_model)

    return X_train, X_test, y_train, y_test
# A raw string is required for Windows paths: in a normal literal the
# "\U" of "C:\Users" starts a unicode escape and is a SyntaxError.
# NOTE(review): the path separators were not recoverable; confirm the
# folder layout below.
X_train, X_test, y_train, y_test = data_cleaning_pipeline(
    r"C:\Users\lynda\Documents\Winter\Principals of AI engneering\sample\sample.csv",
    'issue_body',
    'issue_title',
)
# X_train.info()
# y_train.info()
# Rest of your code...
print("Shapes after preprocessing:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("Types after preprocessing:")
print("X_train type:", type(X_train))
print("X_test type:", type(X_test))
print("y_train type:", type(y_train))
print("y_test type:", type(y_test))

# The experimental flag has to be imported before HalvingGridSearchCV
# can be imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Step by Step Solution
There are 3 steps involved.
Step: 1
Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI-powered tools for academic success.
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started