what is wrong with my code and correct it Import necessary libraries import re import pandas as pd import string import numpy as np from nltk corpus import stopwords from nltk tokenize import word tokenize from nltk stem porter import PorterStemmer from nltk stem import WordNetLemmatizer Move this import statement up from sklearn model selection import train test split from sklearn ensemble import RandomForestClassifier from sklearn pipeline import Pipeline from sklearn feature extraction text import TfidfVectorizer from gensim models import Word 2 Vec from sklearn preprocessing import LabelEncoder def datacleaning ( text ) if isinstance ( text , str ) tokens word tokenize ( text ) Normalization normalized tokens token lower ( ) for token in tokens Noise removal ( remove non alphabetic characters ) cleaned tokens re sub ( r ' a zA Z ' , ' ' , token ) for token in normalized tokens Stemming stemmer PorterStemmer ( ) stemmed tokens stemmer stem ( token ) for token in cleaned tokens Lemmatization lemmatizer WordNetLemmatizer ( ) lemmatized tokens lemmatizer lemmatize ( token ) for token in cleaned tokens Stop word removal stop words set ( stopwords words ( ' english ' ) ) filtered tokens token for token in lemmatized tokens if token not in stop words Join the tokens into a string result text ' ' join ( filtered tokens ) return result text def train word 2 vec model ( sentences ) model Word 2 Vec ( sentences , vector size 1 0 0 , window 5 , min count 1 , workers 4 ) return model def get word embeddings ( text , model ) words word tokenize ( text ) embeddings model wv word for word in words if word in model wv if not embeddings If no valid word found in the model, return zeros return 0 0 model vector size Flatten the list of vectors into a 1 D array flat embedding val for sublist in embeddings for val in sublist return flat embedding def data cleaning pipeline ( df path, df body, df title ) df pd read csv ( df path ) Drop rows where 'issue body' or 'issue label' is null 
df df dropna ( subset df body, 'issue label' ) X df df body, df title y df issue label Convert categorical labels to numeric using LabelEncoder label encoder LabelEncoder ( ) y label encoder fit transform ( y ) X train, X test, y train, y test train test split ( X , y , test size 0 3 , random state 4 2 ) Apply data cleaning to X train and X test X train df body X train df body apply ( lambda text datacleaning ( text ) ) X train df title X train df title apply ( lambda text datacleaning ( text ) ) X test df body X test df body apply ( lambda text datacleaning ( text ) ) X test df title X test df title apply ( lambda text datacleaning ( text ) ) Train Word 2 Vec model on the training data sentences word tokenize ( text ) for text in X train df body word 2 vec model train word 2 vec model ( sentences ) Apply Word 2 Vec embeddings to X train X train df body X train df body apply ( lambda text get word embeddings ( text , word 2 vec model ) ) X train df title X train df title apply ( lambda text get word embeddings ( text , word 2 vec model ) ) Apply Word 2 Vec embeddings to X test X test df body X test df body apply ( lambda text get word embeddings ( text , word 2 vec model ) ) X test df title X test df title apply ( lambda text get word embeddings ( text , word 2 vec model ) ) return X train, X test, y train, y test X train, X test, y train, y test data cleaning pipeline ( ' C Users lynda Documents Winter Principals of AI engneering sample 1 sample 1 csv ' , 'issue body', 'issue title' ) X train info ( ) y train info ( ) Rest of your code print ( Shapes after preprocessing ) print ( X train shape , X train shape ) print ( y train shape , y train shape ) print ( X test shape , X test shape ) print ( y test shape , y test shape ) print ( Types after preprocessing ) print ( X train type , type ( X train ) ) print ( X test type , type ( X test ) ) print ( y train type , type ( y train ) ) print ( y test type , type ( y test ) ) from sklearn experimental import enable 
halving search cv from sklearn model selection import GridSearchCV from sklearn model selection import RandomizedSearchCV from sklearn ensemble import RandomForestClassifier from sklearn model selection import HalvingGridSearchCV from sklearn model selection import train test split from sklearn metrics import accuracy score import pandas as pd import numpy as np rf RandomForestClassifier ( ) rf fit ( X train, y train ) y pred rf predict ( X test ) Evaluate the model accuracy accuracy score ( y test, y pred ) print ( Accuracy , accuracy )

The Answer is in the image, click to view ...

Answered step by step

Verified Expert Solution

Link Copied!

Question

1 Approved Answer

Posted on Sep 21, 2024

what is wrong with my code and correct it : # Import necessary libraries import re import pandas as pd import string import numpy as

What is wrong with my code, and how can I correct it?

# Import necessary libraries

import re
import string

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Compiled once: strips every character that is not an ASCII letter.
_NON_ALPHA = re.compile(r"[^a-zA-Z]")

# Filled on first use so the stop-word corpus is read once, not once per row.
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def datacleaning(text):
    """Normalise one raw text cell into a space-separated string of tokens.

    Steps: tokenise, lower-case, strip non-alphabetic characters, lemmatise,
    then drop English stop words.

    Args:
        text: The raw cell value. Anything that is not a non-blank ``str``
            (``None``, ``NaN``, numbers, whitespace) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces. Always a ``str``; ``""``
        when there is nothing to clean, so callers can tokenise the result
        without checking for ``None``.
    """
    # Previously non-string input fell through and returned None, which
    # crashed the tokeniser in the embedding step.
    if not isinstance(text, str) or not text.strip():
        return ""

    lowered = (token.lower() for token in word_tokenize(text))
    # Punctuation-only tokens collapse to "" here and must be discarded,
    # otherwise they survive as empty entries in the joined output.
    alphabetic = [word for word in (_NON_ALPHA.sub("", t) for t in lowered) if word]

    # The earlier Porter-stemming pass was removed: its result was never
    # used, only the lemmatised tokens reached the output.
    lemmatizer = WordNetLemmatizer()
    stop_words = _english_stop_words()
    kept = [
        lemma
        for lemma in (lemmatizer.lemmatize(word) for word in alphabetic)
        if lemma not in stop_words
    ]
    return " ".join(kept)

def train_word2vec_model(sentences):
    """Fit and return a gensim ``Word2Vec`` model.

    Args:
        sentences: Iterable of token lists, one list per document.

    Returns:
        The ``Word2Vec`` instance built with ``vector_size=100``,
        ``window=5``, ``min_count=1`` and ``workers=4``.
    """
    hyperparameters = {
        "vector_size": 100,
        "window": 5,
        "min_count": 1,
        "workers": 4,
    }
    return Word2Vec(sentences, **hyperparameters)

def get_word_embeddings(text, model):
    """Return one fixed-length embedding for ``text``.

    The vectors of all in-vocabulary tokens are averaged, so every call
    returns exactly ``model.vector_size`` floats. Concatenating the word
    vectors instead would make the length depend on the number of tokens
    and produce ragged rows that no estimator can consume.

    Args:
        text: Cleaned text to embed. Non-string or blank input yields the
            zero vector.
        model: Trained model exposing ``wv`` (supports ``in`` and indexing
            by word) and ``vector_size``.

    Returns:
        A list of ``model.vector_size`` floats; all zeros when no token of
        ``text`` is in the model vocabulary.
    """
    zero_vector = [0.0] * model.vector_size
    if not isinstance(text, str) or not text.strip():
        return zero_vector

    vectors = [model.wv[word] for word in word_tokenize(text) if word in model.wv]
    if not vectors:
        return zero_vector

    # Mean over the word axis gives a single vector of length vector_size.
    return np.mean(vectors, axis=0).tolist()

def _embed_column(texts, model):
    """Return an ``(len(texts), vector_size)`` float array, one row per text."""
    size = model.vector_size
    rows = [
        # Fold whatever came back into a single vector of length `size`,
        # regardless of how many word vectors it contains.
        np.asarray(get_word_embeddings(text, model), dtype=float)
        .reshape(-1, size)
        .mean(axis=0)
        for text in texts
    ]
    return np.vstack(rows) if rows else np.empty((0, size))


def _feature_frame(frame, columns, model):
    """Build a numeric DataFrame of embeddings for ``columns`` of ``frame``."""
    parts = []
    for column in columns:
        names = [f"{column}_w2v_{i}" for i in range(model.vector_size)]
        matrix = _embed_column(frame[column], model)
        parts.append(pd.DataFrame(matrix, index=frame.index, columns=names))
    return pd.concat(parts, axis=1)


def data_cleaning_pipeline(df_path, df_body, df_title, label_col="issue_label"):
    """Load a CSV and turn its text columns into numeric train/test features.

    Rows missing the body or the label are dropped; a missing title is
    treated as empty text. Labels are integer-encoded, the data is split
    70/30 with ``random_state=42``, both text columns are cleaned, a
    Word2Vec model is trained on the training texts only, and each text is
    replaced by its embedding.

    Args:
        df_path: Path of the CSV file to read.
        df_body: Name of the body text column.
        df_title: Name of the title text column.
        label_col: Name of the label column. Defaults to ``"issue_label"``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X`` values are
        DataFrames with one float column per embedding dimension per text
        column (scalar cells, so they can be passed straight to an
        estimator's ``fit``); the ``y`` values are the encoded label arrays.

    Raises:
        ValueError: If the training split has no tokens left after cleaning,
            so no embedding model can be trained.
    """
    text_columns = (df_body, df_title)

    df = pd.read_csv(df_path)
    df = df.dropna(subset=[df_body, label_col])

    # Null titles become "" so the cleaning step always receives a string.
    X = df[[df_body, df_title]].fillna("").astype(str)
    y = LabelEncoder().fit_transform(df[label_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Work on copies: writing into the slices returned by the split raises
    # SettingWithCopyWarning and may not modify the intended object.
    X_train = X_train.copy()
    X_test = X_test.copy()
    for frame in (X_train, X_test):
        for column in text_columns:
            frame[column] = frame[column].apply(datacleaning)

    # Train on training-split bodies and titles so title words get vectors
    # too, while nothing from the test split leaks into the embedding.
    sentences = []
    for column in text_columns:
        for text in X_train[column]:
            tokens = word_tokenize(text)
            if tokens:
                sentences.append(tokens)
    if not sentences:
        raise ValueError("No tokens left in the training data after cleaning.")
    word2vec_model = train_word2vec_model(sentences)

    X_train = _feature_frame(X_train, text_columns, word2vec_model)
    X_test = _feature_frame(X_test, text_columns, word2vec_model)
    return X_train, X_test, y_train, y_test

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Build the train/test splits from the sample issue data set.
X_train, X_test, y_train, y_test = data_cleaning_pipeline(
    'C:/Users/lynda/Documents/Winter/Principals of AI engneering/sample1/sample1.csv',
    'issue_body',
    'issue_title',
)

# Report the shape and then the type of every split.
splits = {"X_train": X_train, "y_train": y_train, "X_test": X_test, "y_test": y_test}

print("Shapes after preprocessing:")
for split_name, split in splits.items():
    print(f"{split_name} shape:", split.shape)

print("Types after preprocessing:")
for split_name in ("X_train", "X_test", "y_train", "y_test"):
    print(f"{split_name} type:", type(splits[split_name]))

# Fit a random forest with default settings and score it on the held-out split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Step by Step Solution

There are 3 Steps involved in it

Step: 1

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

Step: 3

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

Database Systems For Advanced Applications Dasfaa 2022 International Workshops Bdms Bdqm Gdma Iwbt Maqtds And Pmbd Virtual Event April 11 14 2022 Proceedings Lncs 13248

Authors: Uday Kiran Rage ,Vikram Goyal ,P. Krishna Reddy

1st Edition

★★★★★

Technology. Send a brief e-mail to your instructor explaining why you agree or disagree with the statement "Jargon is technical slang." (Objective 5)

Answered: 1 week ago

Previous Question Next Question