Question
Need help with logistic regression using the gradient ascent algorithm on the Wisconsin Breast Cancer dataset (uploaded at https://drive.google.com/file/d/1hoadOlDSvBa9T5nkeVGa9GRba3UDrQJZ/view?usp=drivesdk). The values should run from 0.08 to 6, yet here they start at 1.66 and thus end at 0.549, with the rest coming out as Inf & NaN. Any help would be appreciated.
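For reference (this note is not part of the original post), gradient ascent for logistic regression usually maximizes the average log-likelihood l(theta) = (1/m) * sum( y*log(h) + (1-y)*log(1-h) ) with h = sigmoid(X %*% theta), whose gradient is (1/m) * t(X) %*% (y - h). The cost and update in the code below use a minus sign between the two log terms and move theta along (h - y), which is worth double-checking against this form. A minimal sketch of the standard formulation, reusing the same variable names:
sigmoid = function(z) { 1/(1 + exp(-z)) }
log_likelihood = function(theta, X, y) #average log-likelihood to be maximized
{
  h = sigmoid(X %*% theta)
  as.numeric( (1/NROW(y)) * ( t(y) %*% log(h) + t(1 - y) %*% log(1 - h) ) )
}
ascent_step = function(theta, X, y, alpha) #one gradient-ascent update
{
  h = sigmoid(X %*% theta)
  theta + (alpha/NROW(y)) * t(X) %*% (y - h)   #note the sign: y - h is added to theta when maximizing
}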
Code -
cancerData = read.csv("breast-cancer-dataset.csv")
#DATA PREPARATION
X = as.matrix(cancerData[-c(2, 33)]) #X is 569 by 31 (segmented out the labels and a NA column)
X = cbind(as.vector(rep(1, 569)), X) #add a column of ones for intercept term -- X is now 569 by 32 (features matrix)
means_X = colMeans(X) #obtain mean of each feature
std_X = apply(X, 2, sd) #obtain standard deviation of each feature
#Apply mean normalization -- speeds up gradient descent and solves NaN issue
for (feature in 2:ncol(X)){ X[ , feature] = ( X[ , feature] - means_X[feature] ) / std_X[feature] }
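As a side note, base R's scale() performs the same centering and scaling in one call; a sketch of an equivalent replacement for the loop above (use one or the other, not both):
X[ , -1] = scale(X[ , -1])   #centers each non-intercept column to mean 0 and scales it to sd 1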
labels = cancerData[2] #extract the labels we want to predict
numericLabels = ifelse(labels$diagnosis == 'M', 1, 0) #Malignant = 1, Benign = 0
y = as.matrix(numericLabels) #y is 569 by 1 (labels matrix)
set.seed(125)
sample = sample.int(n = nrow(X), size = floor(.75*nrow(X)), replace = F)
X_train = X[sample, ]
y_train = y[sample, ]
X_test = X[-sample, ]
y_test = y[-sample, ]
#COMPUTE COST AND GRADIENT
sigmoid = function(z)
{
g = 1/(1 + exp(-1 * z))
return(g)
}
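One common source of the Inf & NaN entries is log(h) or log(1 - h) evaluating to -Inf when the sigmoid saturates to exactly 0 or 1 in floating point. A minimal sketch of a clipped helper (safe_probs and the eps value are my choices, not from the original code):
safe_probs = function(z, eps = 1e-12)
{
  h = 1/(1 + exp(-z))
  pmin(pmax(h, eps), 1 - eps)   #keep h strictly inside (0, 1) so log(h) and log(1 - h) stay finite
}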
cost = function(theta, X, y) #Cost function. Returns cost and gradients necessary to perform optimization.
{
m = NROW(y) #number of training examples
h = sigmoid(X %*% theta) #h is the vector of hypothesis predictions, i.e. 569 x 1
J = (1/m) * ( t(y) %*% log(h) - t((1-y)) %*% log(1-h)) #Compute Cost
print(NROW(y))
grad = (1/m) * t(X) %*% (sigmoid(X %*% theta) - y) #Compute gradient
return(list("cost" = J, "gradient" = grad))
}
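A quick way to sanity-check a cost/gradient pair like cost() above is a central finite-difference comparison. A small sketch (numeric_grad is a name I introduce here, not part of the post):
numeric_grad = function(f, theta, X, y, step = 1e-5)
{
  sapply(1:length(theta), function(j) {
    e = rep(0, length(theta)); e[j] = step
    ( f(theta + e, X, y)$cost - f(theta - e, X, y)$cost ) / (2 * step)   #central difference in coordinate j
  })
}
#Example: max(abs( numeric_grad(cost, matrix(0, 32, 1), X_train, y_train)
#                  - cost(matrix(0, 32, 1), X_train, y_train)$gradient ))
#should be near zero when the cost and its analytic gradient are consistent with each other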
gradientAscent = function(X, y, theta, alpha, num_iterations) #Performs gradient ascent to maximize cost.
{
m = NROW(y) #number of training examples
J_history = matrix(0, num_iterations, 1) #initialize a variable to record cost at each iteration
for (iteration in 1:num_iterations) {
theta = theta + ( (alpha/m) * t(X) %*% (sigmoid(X %*% theta) - y) )
temp_cost = reg_cost(theta, X, y, 0)
J_history[iteration, 1] = temp_cost$cost
}
return(list("theta" = theta, "cost_history" = J_history))
}
reg_cost = function(theta, X, y, lambda) #Regularized cost function. Returns cost and gradients necessary to perform optimization.
{
m = NROW(y) #number of training examples
h = sigmoid(X %*% theta) #h is the vector of hypothesis predictions, i.e. 569 x 1
reg_term = (lambda/(2*m)) * sum( theta[2:NROW(theta)]^2 ) #compute regularization term
J = (1/m) * ( t(y) %*% log(h) - t((1-y)) %*% log(1-h)) + reg_term #Compute Cost
grad = as.vector(rep(0, 32)) #initialize gradient as 32 x 1 (same size as theta)
grad_reg_term = (lambda/m) * theta[2:NROW(theta)] #compute gradient regularization term
grad[1] = (1/m) * t(X[, 1]) %*% (h - y) #Compute gradient
grad[2:length(grad)] = ( (1/m) * t(X[, 2:NCOL(X)]) %*% (h-y) ) + grad_reg_term
return(list("cost" = J, "gradient" = grad))
}
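For comparison, here is a minimal sketch of the penalized objective written in "ascent" form: the average log-likelihood minus an L2 penalty that skips the intercept, with the matching gradient. It reuses the sigmoid() defined above and is shown only as a reference point next to reg_cost():
reg_log_likelihood = function(theta, X, y, lambda)
{
  m = NROW(y)
  h = sigmoid(X %*% theta)
  penalty = (lambda/(2*m)) * sum(theta[-1]^2)                #intercept term is not penalized
  ll = (1/m) * ( t(y) %*% log(h) + t(1 - y) %*% log(1 - h) )
  grad = (1/m) * t(X) %*% (y - h)
  grad[-1] = grad[-1] - (lambda/m) * theta[-1]               #penalty shrinks the non-intercept weights
  return(list("objective" = as.numeric(ll - penalty), "gradient" = grad))
}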
initial_theta = matrix(0, 32, 1) #initialize parameters -- size is 32 x 1 (includes extra zero for the bias term)
cost_grad_list = reg_cost(initial_theta, X_train, y_train, 1) #calculate and store the initial cost & gradient (reg_cost() returns them in a list)
model_cost = cost_grad_list$cost #extract the cost
#gradient is 32 x 1 (one for each theta parameter)
gradient = cost_grad_list$gradient #extract the gradient
#MAXIMIZE COST
#Performs regularized gradient ascent to maximize cost.
regGradientAscent = function(X, y, theta, alpha, num_iterations, lambda)
{
m = NROW(y) #number of training examples
J_history = matrix(0, num_iterations, 1) #initialize a variable to record cost at each iteration
for (iteration in 1:num_iterations)
{
reg_term = (lambda/m) * theta[2:NROW(theta)] #compute regularization term
theta[1] = theta[1] + ( (alpha/m) * t(X[, 1]) %*% (sigmoid(X %*% theta) - y) )
theta[2:NROW(theta)] = theta[2:NROW(theta)] + alpha * ( (1/m) * t(X[, 2:NCOL(X)]) %*% (sigmoid(X %*% theta) - y) + reg_term)
J_history[iteration, 1] = reg_cost(theta, X, y, 0)$cost #store the cost history to check that the cost is actually increasing each iteration
}
return(list("theta" = theta, "cost_history" = J_history))
}
result = regGradientAscent(X_train, y_train, initial_theta, 0.05, 400, 1) #perform gradient ascent to optimize parameters
theta = result$theta #extract parameters
cost_history = result$cost_history #extract cost history to check if cost is actually increasing each iteration
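With cost_history in hand, one quick diagnostic for the Inf & NaN symptom is to locate where the history first goes non-finite and to check whether it moves in the intended direction; a small base-R sketch:
if (any(!is.finite(cost_history))) {
  cat("first non-finite value at iteration", which(!is.finite(cost_history))[1], "\n")
}
cat("fraction of recorded steps (ignoring NaN) where the value increased:",
    mean(diff(cost_history[, 1]) > 0, na.rm = TRUE), "\n")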
predict = function(X, theta) #Predicts labels using X as a feature matrix and theta as the corresponding parameter matrix
{
m = nrow(X) #save the number of training examples
predictions = matrix(0, m, 1) #initialize predictions
probabilities = X %*% theta #obtain probabilities
for (i in 1:m)
{
if (probabilities[i] <= 0) { predictions[i] = 1 }
}
return(predictions)
}
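For comparison, the conventional decision rule labels an example as positive when sigmoid(X %*% theta) >= 0.5, i.e. when the linear score X %*% theta is >= 0, whereas predict() above assigns 1 when the score is <= 0. A vectorized sketch (predict_labels is my name for it):
predict_labels = function(X, theta, threshold = 0.5)
{
  probs = sigmoid(X %*% theta)        #actual probabilities in (0, 1)
  as.integer(probs >= threshold)      #1 = predicted malignant, 0 = predicted benign
}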
predictions = predict(X_test, theta) #predict on the test data
accuracy = mean(predictions == y_test) #compare our predictions to the actual values
print(cost_history)
print(accuracy)
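Beyond a single accuracy number, a confusion table from base R's table() is often informative here; a one-line sketch:
print( table(predicted = as.vector(predictions), actual = as.vector(y_test)) )   #rows: predicted labels, columns: true labels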
validationCurve = function(X_train, y_train, Xval, yval)
{
lambda_vec = c(6, 5.5, 5, 4.5, 4, 3.5, 3, 2.5, 2, 1.5, 1, 0.5, 0.1, 0)
error_train = as.vector(rep(1, length(lambda_vec)))
error_val = as.vector(rep(1, length(lambda_vec)))
initial_theta = matrix(0, 32, 1)
for(i in 1:length(lambda_vec))
{
theta = regGradientAscent(X_train, y_train, initial_theta, 0.05, 400, lambda_vec[i])$theta
error_train[i] = reg_cost(theta, X_train, y_train, 0)$cost
error_val[i] = reg_cost(theta, Xval, yval, 0)$cost
}
return(list("lambda_vec" = lambda_vec, "error_train" = error_train, "error_val" = error_val))
}
#visualize lambda effects
learningCurveInfo = validationCurve(X_train, y_train, X_test, y_test)
lambda_vec = learningCurveInfo$lambda_vec
error_train = learningCurveInfo$error_train
error_val = learningCurveInfo$error_val
#plot effect of decreasing lambda
plot.new()
par(mfrow=c(1,2)) #conclusion from graphs: lambda is best left at 1
plot(lambda_vec, error_train, col = 'blue')
plot(lambda_vec, error_val, col = 'red')
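As an alternative to the two side-by-side panels, both curves can be overlaid on one set of axes; a sketch (finite = TRUE guards the y-range against any Inf entries):
plot(lambda_vec, error_train, type = "b", col = "blue", xlab = "lambda", ylab = "cost",
     ylim = range(c(error_train, error_val), finite = TRUE))
lines(lambda_vec, error_val, type = "b", col = "red")
legend("topright", legend = c("train", "validation"), col = c("blue", "red"), lty = 1)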