Answered step by step
Verified Expert Solution
Link Copied!

Question

1 Approved Answer

what is wrong with my code and correct it : # Import necessary libraries import re import pandas as pd import string import numpy as

what is wrong with my code and correct it : # Import necessary libraries
import re
import pandas as pd
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer # Move this import statement up
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
def datacleaning(text):
if isinstance(text, str):
tokens = word_tokenize(text)
# Normalization
normalized_tokens =[token.lower() for token in tokens]
# Noise removal (remove non-alphabetic characters)
cleaned_tokens =[re.sub(r'[^a-zA-Z]','', token) for token in normalized_tokens]
# Stemming
stemmer = PorterStemmer()
stemmed_tokens =[stemmer.stem(token) for token in cleaned_tokens]
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens =[lemmatizer.lemmatize(token) for token in cleaned_tokens]
# Stop-word removal
stop_words = set(stopwords.words('english'))
filtered_tokens =[token for token in lemmatized_tokens if token not in stop_words]
# Join the tokens into a string
result_text =''.join(filtered_tokens)
return result_text
def train_word2vec_model(sentences):
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
return model
def get_word_embeddings(text, model):
words = word_tokenize(text)
embeddings =[model.wv[word] for word in words if word in model.wv]
if not embeddings:
# If no valid word found in the model, return zeros
return [0.0]* model.vector_size
# Flatten the list of vectors into a 1D array
flat_embedding =[val for sublist in embeddings for val in sublist]
return flat_embedding
def data_cleaning_pipeline(df_path, df_body, df_title):
df = pd.read_csv(df_path)
# Drop rows where 'issue_body' or 'issue_label' is null
df = df.dropna(subset=[df_body, 'issue_label'])
X = df[[df_body, df_title]]
y = df["issue_label"]
# Convert categorical labels to numeric using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Apply data cleaning to X_train and X_test
X_train[df_body]= X_train[df_body].apply(lambda text: datacleaning(text))
X_train[df_title]= X_train[df_title].apply(lambda text: datacleaning(text))
X_test[df_body]= X_test[df_body].apply(lambda text: datacleaning(text))
X_test[df_title]= X_test[df_title].apply(lambda text: datacleaning(text))
# Train Word2Vec model on the training data
sentences =[word_tokenize(text) for text in X_train[df_body]]
word2vec_model = train_word2vec_model(sentences)
# Apply Word2Vec embeddings to X_train
X_train[df_body]= X_train[df_body].apply(lambda text: get_word_embeddings(text, word2vec_model))
X_train[df_title]= X_train[df_title].apply(lambda text: get_word_embeddings(text, word2vec_model))
# Apply Word2Vec embeddings to X_test
X_test[df_body]= X_test[df_body].apply(lambda text: get_word_embeddings(text, word2vec_model))
X_test[df_title]= X_test[df_title].apply(lambda text: get_word_embeddings(text, word2vec_model))
return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = data_cleaning_pipeline('C:/Users/lynda/Documents/Winter/Principals of AI engneering/sample1/sample1.csv',
'issue_body',
'issue_title')
# X_train.info()
# y_train.info()
# Rest of your code...
print("Shapes after preprocessing:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("Types after preprocessing:")
print("X_train type:", type(X_train))
print("X_test type:", type(X_test))
print("y_train type:", type(y_train))
print("y_test type:", type(y_test))from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Step by Step Solution

There are 3 Steps involved in it

Step: 1

blur-text-image

Get Instant Access to Expert-Tailored Solutions

See step-by-step solutions with expert insights and AI powered tools for academic success

Step: 2

blur-text-image

Step: 3

blur-text-image

Ace Your Homework with AI

Get the answers you need in no time with our AI-driven, step-by-step assistance

Get Started

Recommended Textbook for

More Books

Students also viewed these Databases questions