Question

Need help with logistic regression using the gradient ascent algorithm on the Wisconsin Breast Cancer dataset (uploaded at https://drive.google.com/file/d/1hoadOlDSvBa9T5nkeVGa9GRba3UDrQJZ/view?usp=drivesdk). The cost values should run from 0.08 to 6, yet here they start at 1.66 and end at 0.549, with the rest coming out as Inf & NaN. Any help would be appreciated.

Code -

cancerData = read.csv("breast-cancer-dataset.csv")

# DATA PREPARATION
X = as.matrix(cancerData[-c(2, 33)])   # X is 569 by 31 (segmented out the labels and a NA column)
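One thing that may be worth double-checking (an assumption about the file layout, since common distributions of this dataset put a numeric case id in column 1): the line above drops only the label (column 2) and the trailing NA column (33), so an id column would be kept, standardized, and used as a feature. If that is unintended, it could be excluded in the same step; note the downstream sizes (the 32-element theta, the rep(0, 32) gradient, and so on) would then shrink by one.

# Hypothetical variant, assuming column 1 of the CSV is an id rather than a measurement
X = as.matrix(cancerData[-c(1, 2, 33)])   # 569 x 30: drop id, label, and the NA column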

X = cbind(as.vector(rep(1, 569)), X)   # add a column of ones for the intercept term -- X is now 569 by 32 (features matrix)

means_X = colMeans(X)                  # obtain mean of each feature
std_X = apply(X, 2, sd)                # obtain standard deviation of each feature

# Apply mean normalization -- speeds up gradient descent and solves the NaN issue
for (feature in 2:ncol(X)) {
  X[, feature] = (X[, feature] - means_X[feature]) / std_X[feature]
}

labels = cancerData[2]                                  # extract the labels we want to predict
numericLabels = ifelse(labels$diagnosis == 'M', 1, 0)   # Malignant = 1, Benign = 0
y = as.matrix(numericLabels)                            # y is 569 by 1 (labels matrix)

set.seed(125)
sample = sample.int(n = nrow(X), size = floor(.75 * nrow(X)), replace = F)
X_train = X[sample, ]
y_train = y[sample, ]
X_test = X[-sample, ]
y_test = y[-sample, ]

# COMPUTE COST AND GRADIENT

sigmoid = function(z) {
  g = 1 / (1 + exp(-1 * z))
  return(g)
}

# Cost function. Returns the cost and gradient necessary to perform optimization.
cost = function(theta, X, y) {
  m = NROW(y)                                # number of training examples
  h = sigmoid(X %*% theta)                   # hypothesis predictions, i.e. 569 x 1
  J = (1/m) * (t(y) %*% log(h) - t(1 - y) %*% log(1 - h))   # compute cost
  print(NROW(y))
  grad = (1/m) * t(X) %*% (sigmoid(X %*% theta) - y)         # compute gradient
  return(list("cost" = J, "gradient" = grad))
}
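Not part of the original post, but possibly relevant to the unexpected starting values: with gradient ascent the quantity being maximized is usually the average log-likelihood, in which the two log terms are added rather than subtracted; as written, J mixes the signs, so it is neither the log-likelihood nor the usual cross-entropy cost. A minimal sketch of the conventional form (hypothetical helper name, assuming 0/1 labels and the sigmoid() defined above):

logLikelihood = function(theta, X, y) {
  m = NROW(y)
  h = sigmoid(X %*% theta)
  ll = (1/m) * (t(y) %*% log(h) + t(1 - y) %*% log(1 - h))   # both terms added
  grad = (1/m) * t(X) %*% (y - h)                            # uphill (ascent) direction
  return(list("cost" = ll, "gradient" = grad))
}

At theta = 0 every h equals 0.5, so this evaluates to log(0.5) ≈ -0.693 regardless of the data, which makes the very first entry of the history easy to sanity-check.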

# Performs gradient ascent to maximize cost.
gradientAscent = function(X, y, theta, alpha, num_iterations) {
  m = NROW(y)                                  # number of training examples
  J_history = matrix(0, num_iterations, 1)     # record the cost at each iteration
  for (iteration in 1:num_iterations) {
    theta = theta + ((alpha/m) * t(X) %*% (sigmoid(X %*% theta) - y))
    temp_cost = reg_cost(theta, X, y, 0)
    J_history[iteration, 1] = temp_cost$cost
  }
  return(list("theta" = theta, "cost_history" = J_history))
}
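A possible source of the Inf/NaN entries (an observation, not a confirmed diagnosis): the update above adds a multiple of t(X) %*% (sigmoid(X %*% theta) - y), which is the downhill direction of the log-likelihood, so repeated steps can push theta away from the optimum until sigmoid() saturates at exactly 0 or 1 and log() returns -Inf. A sketch of a variant that steps uphill and clips h defensively (hypothetical function name; the 1e-12 epsilon is an arbitrary choice):

gradientAscentUphill = function(X, y, theta, alpha, num_iterations) {
  m = NROW(y)
  J_history = matrix(0, num_iterations, 1)
  for (iteration in 1:num_iterations) {
    h = sigmoid(X %*% theta)
    h = pmin(pmax(h, 1e-12), 1 - 1e-12)             # keep log(h) and log(1 - h) finite
    theta = theta + (alpha/m) * t(X) %*% (y - h)    # move in the +gradient (uphill) direction
    J_history[iteration, 1] = (1/m) * (t(y) %*% log(h) + t(1 - y) %*% log(1 - h))   # log-likelihood at the pre-update theta
  }
  return(list("theta" = theta, "cost_history" = J_history))
}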

# Regularized cost function. Returns the cost and gradient necessary to perform optimization.
reg_cost = function(theta, X, y, lambda) {
  m = NROW(y)                                                 # number of training examples
  h = sigmoid(X %*% theta)                                    # hypothesis predictions, i.e. 569 x 1
  reg_term = (lambda/(2*m)) * sum(theta[2:NROW(theta)]^2)     # regularization term
  J = (1/m) * (t(y) %*% log(h) - t(1 - y) %*% log(1 - h)) + reg_term   # compute cost
  grad = as.vector(rep(0, 32))                                # initialize gradient as 32 x 1 (same size as theta)
  grad_reg_term = (lambda/m) * theta[2:NROW(theta)]           # gradient regularization term
  grad[1] = (1/m) * t(X[, 1]) %*% (h - y)                     # gradient for the intercept term
  grad[2:length(grad)] = ((1/m) * t(X[, 2:NCOL(X)]) %*% (h - y)) + grad_reg_term
  return(list("cost" = J, "gradient" = grad))
}
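When a cost/gradient pair misbehaves, one standard sanity check (not in the original post) is to compare the analytic gradient against a centered finite-difference approximation of the cost; if the two disagree by more than a tiny amount, the cost and gradient formulas are inconsistent with each other, which is often the first clue in a debugging situation like this one. A minimal sketch (hypothetical helper; eps = 1e-4 is an arbitrary step size):

checkGradient = function(theta, X, y, lambda, eps = 1e-4) {
  analytic = reg_cost(theta, X, y, lambda)$gradient
  numeric = rep(0, length(theta))
  for (j in 1:length(theta)) {
    plus = theta;  plus[j]  = plus[j]  + eps
    minus = theta; minus[j] = minus[j] - eps
    numeric[j] = (reg_cost(plus, X, y, lambda)$cost - reg_cost(minus, X, y, lambda)$cost) / (2 * eps)
  }
  max(abs(analytic - numeric))   # should be very small (e.g. below 1e-6) when cost and gradient match
}

Run on the functions as posted, this check would likely flag a disagreement, since the grad lines above differentiate the standard cross-entropy cost while J carries a flipped sign on its second term.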

initial_theta = matrix(0, 32, 1)   # initialize parameters -- 32 x 1 (includes an extra zero for the bias term)

# Calculate and store the initial cost & gradient (reg_cost() returns them in a list)
cost_grad_list = reg_cost(initial_theta, X_train, y_train, 1)
model_cost = cost_grad_list$cost        # extract the cost
gradient = cost_grad_list$gradient      # extract the gradient -- 32 x 1 (one for each theta parameter)

# MAXIMIZE COST

# Performs regularized gradient ascent to maximize cost.
regGradientAscent = function(X, y, theta, alpha, num_iterations, lambda) {
  m = NROW(y)                                  # number of training examples
  J_history = matrix(0, num_iterations, 1)     # record the cost at each iteration
  for (iteration in 1:num_iterations) {
    reg_term = (lambda/m) * theta[2:NROW(theta)]   # gradient regularization term
    theta[1] = theta[1] + ((alpha/m) * t(X[, 1]) %*% (sigmoid(X %*% theta) - y))
    theta[2:NROW(theta)] = theta[2:NROW(theta)] + alpha * ((1/m) * t(X[, 2:NCOL(X)]) %*% (sigmoid(X %*% theta) - y) + reg_term)
    J_history[iteration, 1] = reg_cost(theta, X, y, 0)$cost   # record the cost so its behaviour can be checked afterwards
  }
  return(list("theta" = theta, "cost_history" = J_history))
}

result = regGradientAscent(X_train, y_train, initial_theta, 0.05, 400, 1)   # perform gradient ascent to optimize parameters
theta = result$theta                  # extract parameters
cost_history = result$cost_history    # extract cost history to check if the cost is actually increasing each iteration
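Not in the original script, but since cost_history is already extracted, a quick plot shows at a glance whether the objective is moving in the intended direction and at which iteration the Inf/NaN values first appear:

plot(as.vector(cost_history), type = "l", xlab = "iteration", ylab = "recorded cost")
which(!is.finite(cost_history))[1]   # index of the first non-finite entry, or NA if there is none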

# Predicts labels using X as a feature matrix and theta as the corresponding parameter matrix
predict = function(X, theta) {
  m = nrow(X)                      # number of examples
  predictions = matrix(0, m, 1)    # initialize predictions
  probabilities = X %*% theta      # linear scores (log-odds), not yet passed through the sigmoid
  for (i in 1:m) {
    if (probabilities[i] <= 0) { predictions[i] = 1 }
  }
  return(predictions)
}
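One more observation (not part of the original post): X %*% theta gives log-odds rather than probabilities, and the conventional decision rule predicts class 1 when the log-odds are non-negative, i.e. when sigmoid(X %*% theta) >= 0.5; the loop above assigns 1 when the score is <= 0, which inverts that convention. A vectorized sketch of the usual rule (hypothetical name):

predictLabels = function(X, theta) {
  scores = X %*% theta                   # log-odds; sigmoid(scores) >= 0.5 exactly when scores >= 0
  predictions = as.numeric(scores >= 0)
  return(predictions)
}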

predictions = predict(X_test, theta)     # predict on the test data
accuracy = mean(predictions == y_test)   # compare our predictions to the actual values
print(cost_history)
print(accuracy)

validationCurve = function(X_train, y_train, Xval, yval) {
  lambda_vec = c(6, 5.5, 5, 4.5, 4, 3.5, 3, 2.5, 2, 1.5, 1, 0.5, 0.1, 0)
  error_train = as.vector(rep(1, length(lambda_vec)))
  error_val = as.vector(rep(1, length(lambda_vec)))
  initial_theta = matrix(0, 32, 1)
  for (i in 1:length(lambda_vec)) {
    theta = regGradientAscent(X_train, y_train, initial_theta, 0.05, 400, lambda_vec[i])$theta
    error_train[i] = reg_cost(theta, X_train, y_train, 0)$cost
    error_val[i] = reg_cost(theta, Xval, yval, 0)$cost
  }
  return(list("lambda_vec" = lambda_vec, "error_train" = error_train, "error_val" = error_val))
}

# Visualize lambda effects
learningCurveInfo = validationCurve(X_train, y_train, X_test, y_test)
lambda_vec = learningCurveInfo$lambda_vec
error_train = learningCurveInfo$error_train
error_val = learningCurveInfo$error_val

# Plot effect of decreasing lambda
plot.new()
par(mfrow = c(1, 2))   # conclusion from graphs: lambda best off as one
plot(lambda_vec, error_train, col = 'blue')
plot(lambda_vec, error_val, col = 'red')
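Optionally (not in the original), drawing both curves on shared axes with a legend makes the train/validation comparison at each lambda easier to read:

par(mfrow = c(1, 1))   # back to a single panel
plot(lambda_vec, error_train, type = "b", col = "blue", xlab = "lambda", ylab = "error",
     ylim = range(c(error_train, error_val), finite = TRUE))
lines(lambda_vec, error_val, type = "b", col = "red")
legend("topright", legend = c("train", "validation"), col = c("blue", "red"), lty = 1)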
